aboutsummaryrefslogtreecommitdiff
path: root/lib/Bytecode
diff options
context:
space:
mode:
authorChris Lattner <sabre@nondot.org>2005-06-16 04:55:52 +0000
committerChris Lattner <sabre@nondot.org>2005-06-16 04:55:52 +0000
commitf9c775c22e0ac3eb520d8b4dc89d3f16f274ebb6 (patch)
tree41821f34313e7b5f7a4cdf5976bd88ea0dde156c /lib/Bytecode
parent5adb41a750c9ab6c4d5e3b86a3722c5e562b16a1 (diff)
Fix PR583 and testcase Transforms/InstCombine/2005-06-15-DivSelectCrash.ll
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@22227 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Bytecode')
0 files changed, 0 insertions, 0 deletions
mode'>-rw-r--r--mm/bootmem.c55
-rw-r--r--mm/bounce.c301
-rw-r--r--mm/cleancache.c273
-rw-r--r--mm/compaction.c439
-rw-r--r--mm/dmapool.c31
-rw-r--r--mm/early_ioremap.c245
-rw-r--r--mm/fadvise.c38
-rw-r--r--mm/filemap.c1351
-rw-r--r--mm/filemap_xip.c5
-rw-r--r--mm/fremap.c86
-rw-r--r--mm/frontswap.c169
-rw-r--r--mm/gup.c662
-rw-r--r--mm/huge_memory.c1008
-rw-r--r--mm/hugetlb.c1651
-rw-r--r--mm/hugetlb_cgroup.c115
-rw-r--r--mm/hwpoison-inject.c11
-rw-r--r--mm/internal.h85
-rw-r--r--mm/iov_iter.c743
-rw-r--r--mm/kmemleak-test.c36
-rw-r--r--mm/kmemleak.c200
-rw-r--r--mm/ksm.c810
-rw-r--r--mm/list_lru.c152
-rw-r--r--mm/madvise.c154
-rw-r--r--mm/memblock.c784
-rw-r--r--mm/memcontrol.c3307
-rw-r--r--mm/memory-failure.c524
-rw-r--r--mm/memory.c1648
-rw-r--r--mm/memory_hotplug.c906
-rw-r--r--mm/mempolicy.c475
-rw-r--r--mm/mempool.c14
-rw-r--r--mm/migrate.c618
-rw-r--r--mm/mincore.c30
-rw-r--r--mm/mlock.c461
-rw-r--r--mm/mm_init.c81
-rw-r--r--mm/mmap.c688
-rw-r--r--mm/mmu_context.c6
-rw-r--r--mm/mmu_notifier.c72
-rw-r--r--mm/mmzone.c20
-rw-r--r--mm/mprotect.c161
-rw-r--r--mm/mremap.c70
-rw-r--r--mm/msync.c9
-rw-r--r--mm/nobootmem.c116
-rw-r--r--mm/nommu.c192
-rw-r--r--mm/oom_kill.c90
-rw-r--r--mm/page-writeback.c421
-rw-r--r--mm/page_alloc.c1530
-rw-r--r--mm/page_cgroup.c19
-rw-r--r--mm/page_io.c143
-rw-r--r--mm/page_isolation.c22
-rw-r--r--mm/pagewalk.c72
-rw-r--r--mm/percpu.c257
-rw-r--r--mm/pgtable-generic.c53
-rw-r--r--mm/process_vm_access.c280
-rw-r--r--mm/readahead.c73
-rw-r--r--mm/rmap.c769
-rw-r--r--mm/shmem.c485
-rw-r--r--mm/slab.c1622
-rw-r--r--mm/slab.h148
-rw-r--r--mm/slab_common.c499
-rw-r--r--mm/slob.c47
-rw-r--r--mm/slub.c933
-rw-r--r--mm/sparse-vmemmap.c33
-rw-r--r--mm/sparse.c293
-rw-r--r--mm/swap.c576
-rw-r--r--mm/swap_state.c167
-rw-r--r--mm/swapfile.c974
-rw-r--r--mm/truncate.c274
-rw-r--r--mm/util.c135
-rw-r--r--mm/vmacache.c132
-rw-r--r--mm/vmalloc.c488
-rw-r--r--mm/vmpressure.c380
-rw-r--r--mm/vmscan.c1555
-rw-r--r--mm/vmstat.c158
-rw-r--r--mm/workingset.c414
-rw-r--r--mm/zbud.c527
-rw-r--r--mm/zsmalloc.c1117
-rw-r--r--mm/zswap.c940
81 files changed, 11979 insertions, 22965 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e9977a9d65..278e3ab1f16 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@
config SELECT_MEMORY_MODEL
def_bool y
- depends on ARCH_SELECT_MEMORY_MODEL
+ depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
choice
prompt "Memory model"
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL
Some users of more advanced features like NUMA and
memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
+ DISCONTIGMEM is an more mature, better tested system,
but is incompatible with memory hotplug and may suffer
decreased performance over SPARSEMEM. If unsure between
"Sparse Memory" and "Discontiguous Memory", choose
@@ -134,9 +134,6 @@ config HAVE_MEMBLOCK
config HAVE_MEMBLOCK_NODE_MAP
boolean
-config HAVE_MEMBLOCK_PHYS_MAP
- boolean
-
config ARCH_DISCARD_MEMBLOCK
boolean
@@ -156,34 +153,21 @@ config MOVABLE_NODE
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows the following
- two things:
- - When the system is booting, node full of hotpluggable memory can
- be arranged to have only movable memory so that the whole node can
- be hot-removed. (need movable_node boot option specified).
- - After the system is up, the option allows users to online all the
- memory of a node as movable memory so that the whole node can be
- hot-removed.
-
- Users who don't use the memory hotplug feature are fine with this
- option on since they don't specify movable_node boot option or they
- don't online memory as movable.
+ memory device cannot be hotplugged. This option allows users to
+ online all the memory of a node as movable memory so that the whole
+ node can be hotplugged. Users who don't use the memory hotplug
+ feature are fine with this option on since they don't online memory
+ as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
-#
-# Only be set on architectures that have completely implemented memory hotplug
-# feature. If you are not sure, don't touch it.
-#
-config HAVE_BOOTMEM_INFO_NODE
- def_bool n
-
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
- depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
@@ -192,8 +176,6 @@ config MEMORY_HOTPLUG_SPARSE
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
- select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
@@ -219,14 +201,11 @@ config PAGEFLAGS_EXTENDED
#
config SPLIT_PTLOCK_CPUS
int
- default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
+ default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
default "4"
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- boolean
-
#
# support for memory balloon compaction
config BALLOON_COMPACTION
@@ -258,7 +237,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
- depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
@@ -267,9 +246,6 @@ config MIGRATION
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
-config ARCH_ENABLE_HUGEPAGE_MIGRATION
- boolean
-
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -279,27 +255,8 @@ config ZONE_DMA_FLAG
default "1"
config BOUNCE
- bool "Enable bounce buffers"
- default y
+ def_bool y
depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
- help
- Enable bounce buffers for devices that cannot access
- the full range of memory available to the CPU. Enabled
- by default when ZONE_DMA or HIGHMEM is selected, but you
- may say n to override this.
-
-# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
-# have more than 4GB of memory, but we don't currently use the IOTLB to present
-# a 32-bit address to OHCI. So we need to use a bounce pool instead.
-#
-# We also use the bounce pool to provide stable page writes for jbd. jbd
-# initiates buffer writeback without locking the page or setting PG_writeback,
-# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
-# a major rework effort. Instead, use the bounce buffer to snapshot pages
-# (until jbd goes away). The only jbd user is ext3.
-config NEED_BOUNCE_POOL
- bool
- default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
config NR_QUICK
int
@@ -308,12 +265,8 @@ config NR_QUICK
default "1"
config VIRT_TO_BUS
- bool
- help
- An architecture should select this if it implements the
- deprecated interface virt_to_bus(). All new architectures
- should probably not select this.
-
+ def_bool y
+ depends on !ARCH_NO_VIRT_TO_BUS
config MMU_NOTIFIER
bool
@@ -436,6 +389,16 @@ choice
benefit.
endchoice
+config CROSS_MEMORY_ATTACH
+ bool "Cross Memory Support"
+ depends on MMU
+ default y
+ help
+ Enabling this option adds the system calls process_vm_readv and
+ process_vm_writev which allow a process with the correct privileges
+ to directly read from or write to to another process's address space.
+ See the man page for more details.
+
#
# UP and nommu archs use km based percpu allocator
#
@@ -483,112 +446,3 @@ config FRONTSWAP
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
-
-config CMA
- bool "Contiguous Memory Allocator"
- depends on HAVE_MEMBLOCK && MMU
- select MIGRATION
- select MEMORY_ISOLATION
- help
- This enables the Contiguous Memory Allocator which allows other
- subsystems to allocate big physically-contiguous blocks of memory.
- CMA reserves a region of memory and allows only movable pages to
- be allocated from it. This way, the kernel can use the memory for
- pagecache and when a subsystem requests for contiguous area, the
- allocated pages are migrated away to serve the contiguous request.
-
- If unsure, say "n".
-
-config CMA_DEBUG
- bool "CMA debug messages (DEVELOPMENT)"
- depends on DEBUG_KERNEL && CMA
- help
- Turns on debug messages in CMA. This produces KERN_DEBUG
- messages for every CMA call as well as various messages while
- processing calls such as dma_alloc_from_contiguous().
- This option does not affect warning and error messages.
-
-config ZBUD
- tristate
- default n
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
- depends on FRONTSWAP && CRYPTO=y
- select CRYPTO_LZO
- select ZBUD
- default n
- help
- A lightweight compressed cache for swap pages. It takes
- pages that are in the process of being swapped out and attempts to
- compress them into a dynamically allocated RAM-based memory pool.
- This can result in a significant I/O reduction on swap device and,
- in the case where decompressing from RAM is faster that swap device
- reads, can also improve workload performance.
-
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
-config MEM_SOFT_DIRTY
- bool "Track memory changes"
- depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
- select PROC_PAGE_MONITOR
- help
- This option enables memory changes tracking by introducing a
- soft-dirty bit on pte-s. This bit it set when someone writes
- into a page just as regular dirty bit, but unlike the latter
- it can be cleared by hands.
-
- See Documentation/vm/soft-dirty.txt for more details.
-
-config ZSMALLOC
- tristate "Memory allocator for compressed pages"
- depends on MMU
- default n
- help
- zsmalloc is a slab-based memory allocator designed to store
- compressed RAM pages. zsmalloc uses virtual memory mapping
- in order to reduce fragmentation. However, this results in a
- non-standard allocator interface where a handle, not a pointer, is
- returned by an alloc(). This handle must be mapped in order to
- access the allocated space.
-
-config PGTABLE_MAPPING
- bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC
- help
- By default, zsmalloc uses a copy-based object mapping method to
- access allocations that span two pages. However, if a particular
- architecture (ex, ARM) performs VM mapping faster than copying,
- then you should select this. This causes zsmalloc to use page table
- mapping rather than copying for object mapping.
-
- You can check speed with zsmalloc benchmark:
- https://github.com/spartacus06/zsmapbench
-
-config GENERIC_EARLY_IOREMAP
- bool
-
-config MAX_STACK_SIZE_MB
- int "Maximum user stack size for 32-bit processes (MB)"
- default 80
- range 8 256 if METAG
- range 8 2048
- depends on STACK_GROWSUP && (!64BIT || COMPAT)
- help
- This is the maximum stack size in Megabytes in the VM layout of 32-bit
- user processes when the stack grows upwards (currently only on parisc
- and metag arch). The stack will be located at the highest memory
- address minus the given value, unless the RLIMIT_STACK hard limit is
- changed to a smaller value in which case that is used.
-
- A sane initial value is 80 MB.
diff --git a/mm/Makefile b/mm/Makefile
index 4064f3ec145..3a4628751f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
#
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
@@ -16,9 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o vmacache.o \
- interval_tree.o list_lru.o workingset.o \
- iov_iter.o $(mmu-y)
+ compaction.o balloon_compaction.o \
+ interval_tree.o $(mmu-y)
obj-y += init-mm.o
@@ -30,9 +29,9 @@ endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
-obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
@@ -51,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
@@ -59,6 +58,3 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
-obj-$(CONFIG_ZBUD) += zbud.o
-obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
-obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f..d3ca2b3ee17 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -31,14 +31,13 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
+ * reader side protection for bdi_pending_list. bdi_list has RCU reader side
* locking.
*/
DEFINE_SPINLOCK(bdi_lock);
LIST_HEAD(bdi_list);
-
-/* bdi_wq serves all asynchronous writeback tasks */
-struct workqueue_struct *bdi_wq;
+LIST_HEAD(bdi_pending_list);
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
@@ -180,8 +179,7 @@ static ssize_t name##_show(struct device *dev, \
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
\
return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
-} \
-static DEVICE_ATTR_RW(name);
+}
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
@@ -223,25 +221,14 @@ static ssize_t max_ratio_store(struct device *dev,
}
BDI_SHOW(max_ratio, bdi->max_ratio)
-static ssize_t stable_pages_required_show(struct device *dev,
- struct device_attribute *attr,
- char *page)
-{
- struct backing_dev_info *bdi = dev_get_drvdata(dev);
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
- return snprintf(page, PAGE_SIZE-1, "%d\n",
- bdi_cap_stable_pages_required(bdi) ? 1 : 0);
-}
-static DEVICE_ATTR_RO(stable_pages_required);
-
-static struct attribute *bdi_dev_attrs[] = {
- &dev_attr_read_ahead_kb.attr,
- &dev_attr_min_ratio.attr,
- &dev_attr_max_ratio.attr,
- &dev_attr_stable_pages_required.attr,
- NULL,
+static struct device_attribute bdi_dev_attrs[] = {
+ __ATTR_RW(read_ahead_kb),
+ __ATTR_RW(min_ratio),
+ __ATTR_RW(max_ratio),
+ __ATTR_NULL,
};
-ATTRIBUTE_GROUPS(bdi_dev);
static __init int bdi_class_init(void)
{
@@ -249,7 +236,7 @@ static __init int bdi_class_init(void)
if (IS_ERR(bdi_class))
return PTR_ERR(bdi_class);
- bdi_class->dev_groups = bdi_dev_groups;
+ bdi_class->dev_attrs = bdi_dev_attrs;
bdi_debug_init();
return 0;
}
@@ -259,11 +246,6 @@ static int __init default_bdi_init(void)
{
int err;
- bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
- if (!bdi_wq)
- return -ENOMEM;
-
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
@@ -278,6 +260,26 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
return wb_has_dirty_io(&bdi->wb);
}
+static void wakeup_timer_fn(unsigned long data)
+{
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else if (bdi->dev) {
+ /*
+ * When bdi tasks are inactive for long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
+ */
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
/*
* This function is used when the first inode for this bdi is marked dirty. It
* wakes-up the corresponding bdi thread which should then take care of the
@@ -288,19 +290,182 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
* Note, we wouldn't bother setting up the timer, but this function is on the
* fast-path (used by '__mark_inode_dirty()'), so we save few context switches
* by delaying the wake-up.
- *
- * We have to be careful not to postpone flush work if it is scheduled for
- * earlier. Thus we use queue_delayed_work().
*/
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
- spin_unlock_bh(&bdi->wb_lock);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
+
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
+
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
+}
+
+static int bdi_forker_thread(void *ptr)
+{
+ struct bdi_writeback *me = ptr;
+
+ current->flags |= PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
+
+ for (;;) {
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
+
+ /*
+ * Temporary measure, we want to make sure we don't see
+ * dirty data on the default backing_dev_info
+ */
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
+ wb_do_writeback(me, 0);
+ }
+
+ spin_lock_bh(&bdi_lock);
+ /*
+ * In the following loop we are going to check whether we have
+ * some work to do without any synchronization with tasks
+ * waking us up to do work for them. Set the task state here
+ * so that we don't miss wakeups after verifying conditions.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
+
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
+ continue;
+
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
+
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
+
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone will try to
+ * unregister this bdi - it'll wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
+
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_create(bdi_writeback_thread, &bdi->wb,
+ "flush-%s", dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
+ */
+ writeback_inodes_wb(&bdi->wb, 1024,
+ WB_REASON_FORKER_THREAD);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ * And as soon as the bdi thread is visible, we
+ * can start it.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ wake_up_process(task);
+ }
+ bdi_clear_pending(bdi);
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ bdi_clear_pending(bdi);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There are no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for longer time, save energy and
+ * be friendly for battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
+ else
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+ try_to_freeze();
+ break;
+ }
+ }
+
+ return 0;
}
/*
@@ -332,6 +497,20 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
bdi->dev = dev;
+ /*
+ * Just start the forker thread for our default backing_dev_info,
+ * and add other bdi's to the list. They will get a thread created
+ * on-demand when they need it.
+ */
+ if (bdi_cap_flush_forker(bdi)) {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
+ dev_name(dev));
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
+ }
+
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
@@ -355,6 +534,8 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
+ struct task_struct *task;
+
if (!bdi_cap_writeback_dirty(bdi))
return;
@@ -363,26 +544,23 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
*/
bdi_remove_from_list(bdi);
- /* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- clear_bit(BDI_registered, &bdi->state);
- spin_unlock_bh(&bdi->wb_lock);
-
/*
- * Drain work list and shutdown the delayed_work. At this point,
- * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
- * is dying and its work_list needs to be drained no matter what.
+ * If setup is pending, wait for that to complete first
*/
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- flush_delayed_work(&bdi->wb.dwork);
- WARN_ON(!list_empty(&bdi->work_list));
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
/*
- * This shouldn't be necessary unless @bdi for some reason has
- * unflushed dirty IO after work_list is drained. Do it anyway
- * just in case.
+ * Finally, kill the kernel thread. We don't need to be RCU
+ * safe anymore, since the bdi is gone from visibility.
*/
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ spin_lock_bh(&bdi->wb_lock);
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock_bh(&bdi->wb_lock);
+
+ if (task)
+ kthread_stop(task);
}
/*
@@ -408,8 +586,10 @@ void bdi_unregister(struct backing_dev_info *bdi)
bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
- bdi_wb_shutdown(bdi);
+ if (!bdi_cap_flush_forker(bdi))
+ bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
spin_lock_bh(&bdi->wb_lock);
@@ -431,7 +611,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
spin_lock_init(&wb->list_lock);
- INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
/*
@@ -504,11 +684,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
/*
- * If bdi_unregister() had already been called earlier, the dwork
- * could still be pending because bdi_prune_sb() can race with the
- * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
+ * If bdi_unregister() had already been called earlier, the
+ * wakeup_timer could still be armed because bdi_prune_sb()
+ * can race with the bdi_wakeup_thread_delayed() calls from
+ * __mark_inode_dirty().
*/
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ del_timer_sync(&bdi->wb.wakeup_timer);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
@@ -524,6 +705,7 @@ EXPORT_SYMBOL(bdi_destroy);
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
unsigned int cap)
{
+ char tmp[32];
int err;
bdi->name = name;
@@ -532,8 +714,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
if (err)
return err;
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
+ sprintf(tmp, "%.28s%s", name, "-%d");
+ err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
if (err) {
bdi_destroy(bdi);
return err;
@@ -557,7 +739,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
bit = sync ? BDI_sync_congested : BDI_async_congested;
if (test_and_clear_bit(bit, &bdi->state))
atomic_dec(&nr_bdi_congested[sync]);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
}
@@ -660,7 +842,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
{
char kbuf[] = "0\n";
- if (*ppos || *lenp < sizeof(kbuf)) {
+ if (*ppos) {
*lenp = 0;
return 0;
}
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf..07dbc8ec46c 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -267,7 +267,7 @@ void balloon_page_putback(struct page *page)
put_page(page);
} else {
WARN_ON(1);
- dump_page(page, "not movable balloon page");
+ dump_page(page);
}
unlock_page(page);
}
@@ -287,7 +287,7 @@ int balloon_page_migrate(struct page *newpage,
BUG_ON(!trylock_page(newpage));
if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page, "not movable balloon page");
+ dump_page(page);
unlock_page(newpage);
return rc;
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b41..b93376c39b6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,12 +172,11 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long *map, start, end, pages, count = 0;
+ unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
- map = bdata->node_bootmem_map;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
@@ -185,9 +184,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdata - bootmem_node_data, start, end);
while (start < end) {
- unsigned long idx, vec;
+ unsigned long *map, idx, vec;
unsigned shift;
+ map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
@@ -241,26 +241,33 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
-static int reset_managed_pages_done __initdata;
-
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
-
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->managed_pages = 0;
+ if (!is_highmem(z))
+ z->managed_pages = 0;
}
-void __init reset_all_zones_managed_pages(void)
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
- struct pglist_data *pgdat;
-
- for_each_online_pgdat(pgdat)
- reset_node_managed_pages(pgdat);
- reset_managed_pages_done = 1;
+ register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
+ return free_all_bootmem_core(pgdat->bdata);
}
/**
@@ -272,14 +279,14 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
+ struct pglist_data *pgdat;
- reset_all_zones_managed_pages();
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
- totalram_pages += total_pages;
-
return total_pages;
}
@@ -784,7 +791,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat_end_pfn(pgdat);
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
(goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
@@ -826,14 +833,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
}
-void * __init __alloc_bootmem_low_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem_nopanic(size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
-}
-
/**
* __alloc_bootmem_low_node - allocate low boot memory from a specific node
* @pgdat: node to allocate from
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 00000000000..04208677556
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,301 @@
+/* bounce buffer handling for block devices
+ *
+ * - Split from highmem.c
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <asm/tlbflush.h>
+
+#include <trace/events/block.h>
+
+#define POOL_SIZE 64
+#define ISA_POOL_SIZE 16
+
+static mempool_t *page_pool, *isa_page_pool;
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
+static __init int init_emergency_pool(void)
+{
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
+ if (max_pfn <= max_low_pfn)
+ return 0;
+#endif
+
+ page_pool = mempool_create_page_pool(POOL_SIZE, 0);
+ BUG_ON(!page_pool);
+ printk("bounce pool size: %d pages\n", POOL_SIZE);
+
+ return 0;
+}
+
+__initcall(init_emergency_pool);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * highmem version, map in to vec
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+ unsigned long flags;
+ unsigned char *vto;
+
+ local_irq_save(flags);
+ vto = kmap_atomic(to->bv_page);
+ memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+ kunmap_atomic(vto);
+ local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+#define bounce_copy_vec(to, vfrom) \
+ memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * allocate pages in the DMA region for the ISA pool
+ */
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
+{
+ return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
+}
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+ if (isa_page_pool)
+ return 0;
+
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+ mempool_free_pages, (void *) 0);
+ BUG_ON(!isa_page_pool);
+
+ printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+ return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+ unsigned char *vfrom;
+ struct bio_vec *tovec, *fromvec;
+ int i;
+
+ __bio_for_each_segment(tovec, to, i, 0) {
+ fromvec = from->bi_io_vec + i;
+
+ /*
+ * not bounced
+ */
+ if (tovec->bv_page == fromvec->bv_page)
+ continue;
+
+ /*
+ * fromvec->bv_offset and fromvec->bv_len might have been
+ * modified by the block layer, so use the original copy,
+ * bounce_copy_vec already uses tovec->bv_len
+ */
+ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+ bounce_copy_vec(tovec, vfrom);
+ flush_dcache_page(tovec->bv_page);
+ }
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+ struct bio_vec *bvec, *org_vec;
+ int i;
+
+ if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+ set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+ /*
+ * free up bounce indirect pages used
+ */
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ org_vec = bio_orig->bi_io_vec + i;
+ if (bvec->bv_page == org_vec->bv_page)
+ continue;
+
+ dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+ mempool_free(bvec->bv_page, pool);
+ }
+
+ bio_endio(bio_orig, err);
+ bio_put(bio);
+}
+
+static void bounce_end_io_write(struct bio *bio, int err)
+{
+ bounce_end_io(bio, page_pool, err);
+}
+
+static void bounce_end_io_write_isa(struct bio *bio, int err)
+{
+
+ bounce_end_io(bio, isa_page_pool, err);
+}
+
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+ struct bio *bio_orig = bio->bi_private;
+
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ copy_to_high_bio_irq(bio_orig, bio);
+
+ bounce_end_io(bio, pool, err);
+}
+
+static void bounce_end_io_read(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, page_pool, err);
+}
+
+static void bounce_end_io_read_isa(struct bio *bio, int err)
+{
+ __bounce_end_io_read(bio, isa_page_pool, err);
+}
+
+static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
+ mempool_t *pool)
+{
+ struct page *page;
+ struct bio *bio = NULL;
+ int i, rw = bio_data_dir(*bio_orig);
+ struct bio_vec *to, *from;
+
+ bio_for_each_segment(from, *bio_orig, i) {
+ page = from->bv_page;
+
+ /*
+ * is destination page below bounce pfn?
+ */
+ if (page_to_pfn(page) <= queue_bounce_pfn(q))
+ continue;
+
+ /*
+ * irk, bounce it
+ */
+ if (!bio) {
+ unsigned int cnt = (*bio_orig)->bi_vcnt;
+
+ bio = bio_alloc(GFP_NOIO, cnt);
+ memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
+ }
+
+
+ to = bio->bi_io_vec + i;
+
+ to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ inc_zone_page_state(to->bv_page, NR_BOUNCE);
+
+ if (rw == WRITE) {
+ char *vto, *vfrom;
+
+ flush_dcache_page(from->bv_page);
+ vto = page_address(to->bv_page) + to->bv_offset;
+ vfrom = kmap(from->bv_page) + from->bv_offset;
+ memcpy(vto, vfrom, to->bv_len);
+ kunmap(from->bv_page);
+ }
+ }
+
+ /*
+ * no pages bounced
+ */
+ if (!bio)
+ return;
+
+ trace_block_bio_bounce(q, *bio_orig);
+
+ /*
+ * at least one page was bounced, fill in possible non-highmem
+ * pages
+ */
+ __bio_for_each_segment(from, *bio_orig, i, 0) {
+ to = bio_iovec_idx(bio, i);
+ if (!to->bv_page) {
+ to->bv_page = from->bv_page;
+ to->bv_len = from->bv_len;
+ to->bv_offset = from->bv_offset;
+ }
+ }
+
+ bio->bi_bdev = (*bio_orig)->bi_bdev;
+ bio->bi_flags |= (1 << BIO_BOUNCED);
+ bio->bi_sector = (*bio_orig)->bi_sector;
+ bio->bi_rw = (*bio_orig)->bi_rw;
+
+ bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+ bio->bi_idx = (*bio_orig)->bi_idx;
+ bio->bi_size = (*bio_orig)->bi_size;
+
+ if (pool == page_pool) {
+ bio->bi_end_io = bounce_end_io_write;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read;
+ } else {
+ bio->bi_end_io = bounce_end_io_write_isa;
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read_isa;
+ }
+
+ bio->bi_private = *bio_orig;
+ *bio_orig = bio;
+}
+
+void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
+{
+ mempool_t *pool;
+
+ /*
+ * Data-less bio, nothing to bounce
+ */
+ if (!bio_has_data(*bio_orig))
+ return;
+
+ /*
+ * for non-isa bounce case, just check if the bounce pfn is equal
+ * to or bigger than the highest pfn in the system -- in that case,
+ * don't waste time iterating over bio segments
+ */
+ if (!(q->bounce_gfp & GFP_DMA)) {
+ if (queue_bounce_pfn(q) >= blk_max_pfn)
+ return;
+ pool = page_pool;
+ } else {
+ BUG_ON(!isa_page_pool);
+ pool = isa_page_pool;
+ }
+
+ /*
+ * slow path
+ */
+ __blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac435040..32e6f4136fa 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,10 +19,20 @@
#include <linux/cleancache.h>
/*
+ * This global enablement flag may be read thousands of times per second
+ * by cleancache_get/put/invalidate even on systems where cleancache_ops
+ * is not claimed (e.g. cleancache is config'ed on but remains
+ * disabled), so is preferred to the slower alternative: a function
+ * call that checks a non-global.
+ */
+int cleancache_enabled __read_mostly;
+EXPORT_SYMBOL(cleancache_enabled);
+
+/*
* cleancache_ops is set by cleancache_ops_register to contain the pointers
* to the cleancache "backend" implementation functions.
*/
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static struct cleancache_ops cleancache_ops __read_mostly;
/*
* Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -35,101 +45,15 @@ static u64 cleancache_puts;
static u64 cleancache_invalidates;
/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- * a) mount / -> __cleancache_init_fs is called. We set the
- * [shared_|]fs_poolid_map and uuids for.
- *
- * b). user does I/Os -> we call the rest of __cleancache_* functions
- * which return immediately as cleancache_ops is false.
- *
- * c). modprobe zcache -> cleancache_register_ops. We init the backend
- * and set cleancache_ops to true, and for any fs_poolid_map
- * (which is set by __cleancache_init_fs) we initialize the poolid.
- *
- * d). user does I/Os -> now that cleancache_ops is true all the
- * __cleancache_* functions can call the backend. They all check
- * that fs_poolid_map is valid and if so invoke the backend.
- *
- * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
- * reset (which is the second check in the __cleancache_* ops
- * to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
-
-/*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting
*/
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
{
- struct cleancache_ops *old = cleancache_ops;
- int i;
+ struct cleancache_ops old = cleancache_ops;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_NO_BACKEND)
- fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
- if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
- shared_fs_poolid_map[i] = ops->init_shared_fs
- (uuids[i], PAGE_SIZE);
- }
- /*
- * We MUST set cleancache_ops _after_ we have called the backends
- * init_fs or init_shared_fs functions. Otherwise the compiler might
- * re-order where cleancache_ops is set in this function.
- */
- barrier();
- cleancache_ops = ops;
- mutex_unlock(&poolid_mutex);
+ cleancache_ops = *ops;
+ cleancache_enabled = 1;
return old;
}
EXPORT_SYMBOL(cleancache_register_ops);
@@ -137,42 +61,15 @@ EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
- int i;
-
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
- if (cleancache_ops)
- fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
- else
- fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
- }
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
{
- int i;
-
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
- uuids[i] = uuid;
- if (cleancache_ops)
- shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
- (uuid, PAGE_SIZE);
- else
- shared_fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
- }
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid =
+ (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
@@ -192,7 +89,7 @@ static int cleancache_get_key(struct inode *inode,
fhfn = sb->s_export_op->encode_fh;
if (fhfn) {
len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
- if (len <= FILEID_ROOT || len == FILEID_INVALID)
+ if (len <= 0 || len == 255)
return -1;
if (maxlen > CLEANCACHE_KEY_MAX)
return -1;
@@ -202,53 +99,27 @@ static int cleancache_get_key(struct inode *inode,
}
/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
- return shared_fs_poolid_map[fake_pool_id -
- FAKE_SHARED_FS_POOLID_OFFSET];
- else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
- return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
- return FS_NO_BACKEND;
-}
-
-/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
* successful, use it to fill the specified page with data and return 0.
* The pageframe is unchanged and returns -1 if the get fails.
* Page must be locked by caller.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
*/
int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
- if (!cleancache_ops) {
- cleancache_failed_gets++;
- goto out;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
goto out;
- pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
- if (pool_id >= 0)
- ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
@@ -263,32 +134,17 @@ EXPORT_SYMBOL(__cleancache_get_page);
* (previously-obtained per-filesystem) poolid and the page's,
* inode and page index. Page must be locked. Note that a put_page
* always "succeeds", though a subsequent get_page may succeed or fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
*/
void __cleancache_put_page(struct page *page)
{
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
- if (!cleancache_ops) {
- cleancache_puts++;
- return;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
+ VM_BUG_ON(!PageLocked(page));
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
- cleancache_get_key(page->mapping->host, &key) >= 0) {
- cleancache_ops->put_page(pool_id, key, page->index, page);
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ (*cleancache_ops.put_page)(pool_id, key, page->index, page);
cleancache_puts++;
}
}
@@ -297,31 +153,19 @@ EXPORT_SYMBOL(__cleancache_put_page);
/*
* Invalidate any data from cleancache associated with the poolid and the
* page's inode and page index so that a subsequent "get" will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
*/
void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
- if (!cleancache_ops)
- return;
-
- if (fake_pool_id >= 0) {
- pool_id = get_poolid_from_fake(fake_pool_id);
- if (pool_id < 0)
- return;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (pool_id >= 0) {
+ VM_BUG_ON(!PageLocked(page));
if (cleancache_get_key(mapping->host, &key) >= 0) {
- cleancache_ops->invalidate_page(pool_id,
- key, page->index);
+ (*cleancache_ops.invalidate_page)(pool_id,
+ key, page->index);
cleancache_invalidates++;
}
}
@@ -332,63 +176,34 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
* Invalidate all data from cleancache associated with the poolid and the
* mappings's inode so that all subsequent gets to this poolid/inode
* will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
- if (!cleancache_ops)
- return;
-
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
- cleancache_ops->invalidate_inode(pool_id, key);
+ (*cleancache_ops.invalidate_inode)(pool_id, key);
}
EXPORT_SYMBOL(__cleancache_invalidate_inode);
/*
* Called by any cleancache-enabled filesystem at time of unmount;
- * note that pool_id is surrendered and may be returned by a subsequent
- * cleancache_init_fs or cleancache_init_shared_fs.
+ * note that pool_id is surrendered and may be reutrned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
- int index;
- int fake_pool_id = sb->cleancache_poolid;
- int old_poolid = fake_pool_id;
-
- mutex_lock(&poolid_mutex);
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
- old_poolid = shared_fs_poolid_map[index];
- shared_fs_poolid_map[index] = FS_UNKNOWN;
- uuids[index] = NULL;
- } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
- old_poolid = fs_poolid_map[index];
- fs_poolid_map[index] = FS_UNKNOWN;
+ if (sb->cleancache_poolid >= 0) {
+ int old_poolid = sb->cleancache_poolid;
+ sb->cleancache_poolid = -1;
+ (*cleancache_ops.invalidate_fs)(old_poolid);
}
- sb->cleancache_poolid = -1;
- if (cleancache_ops)
- cleancache_ops->invalidate_fs(old_poolid);
- mutex_unlock(&poolid_mutex);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
- int i;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
@@ -400,10 +215,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- fs_poolid_map[i] = FS_UNKNOWN;
- shared_fs_poolid_map[i] = FS_UNKNOWN;
- }
return 0;
}
module_init(init_cleancache)
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642..c62bd063d76 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,7 +15,6 @@
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
-#include <linux/page-isolation.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -86,11 +85,10 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void __reset_isolation_suitable(struct zone *zone)
{
unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
unsigned long pfn;
- zone->compact_cached_migrate_pfn[0] = start_pfn;
- zone->compact_cached_migrate_pfn[1] = start_pfn;
+ zone->compact_cached_migrate_pfn = start_pfn;
zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
@@ -132,43 +130,26 @@ void reset_isolation_suitable(pg_data_t *pgdat)
*/
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
-
- if (cc->ignore_skip_hint)
- return;
-
if (!page)
return;
- if (nr_isolated)
- return;
-
- /*
- * Only skip pageblocks when all forms of compaction will be known to
- * fail in the near future.
- */
- if (set_unsuitable)
+ if (!nr_isolated) {
+ unsigned long pfn = page_to_pfn(page);
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
- /* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (cc->finished_update_migrate)
- return;
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (cc->finished_update_free)
- return;
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
+ /* Update where compaction should restart */
+ if (migrate_scanner) {
+ if (!cc->finished_update_migrate &&
+ pfn > zone->compact_cached_migrate_pfn)
+ zone->compact_cached_migrate_pfn = pfn;
+ } else {
+ if (!cc->finished_update_free &&
+ pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+ }
}
}
#else
@@ -180,7 +161,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
@@ -209,7 +190,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
}
/* async aborts if taking too long or contended */
- if (cc->mode == MIGRATE_ASYNC) {
+ if (!cc->sync) {
cc->contended = true;
return false;
}
@@ -222,39 +203,27 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
return true;
}
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_checklock_irqsave() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
+static inline bool compact_trylock_irqsave(spinlock_t *lock,
+ unsigned long *flags, struct compact_control *cc)
{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
-
- return false;
+ return compact_checklock_irqsave(lock, flags, false, cc);
}
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
- /* If the page is a large free page, then disallow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
return false;
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return true;
+
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (migrate_async_suitable(migratetype))
return true;
/* Otherwise skip the block */
@@ -262,9 +231,10 @@ static bool suitable_migration_target(struct page *page)
}
/*
- * Isolate free pages onto a private freelist. If @strict is true, will abort
- * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
- * (even though it may still end up isolating some pages).
+ * Isolate free pages onto a private freelist. Caller must hold zone->lock.
+ * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
+ * pages inside of the pageblock (even though it may still end up isolating
+ * some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long blockpfn,
@@ -274,9 +244,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
+ unsigned long nr_strict_required = end_pfn - blockpfn;
unsigned long flags;
bool locked = false;
- bool checked_pageblock = false;
cursor = pfn_to_page(blockpfn);
@@ -287,12 +257,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
nr_scanned++;
if (!pfn_valid_within(blockpfn))
- goto isolate_fail;
-
+ continue;
if (!valid_page)
valid_page = page;
if (!PageBuddy(page))
- goto isolate_fail;
+ continue;
/*
* The zone lock must be held to isolate freepages.
@@ -308,23 +277,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
/* Recheck this is a suitable migration target under lock */
- if (!strict && !checked_pageblock) {
- /*
- * We need to check suitability of pageblock only once
- * and this isolate_freepages_block() is called with
- * pageblock range, so just check once is sufficient.
- */
- checked_pageblock = true;
- if (!suitable_migration_target(page))
- break;
- }
+ if (!strict && !suitable_migration_target(page))
+ break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
- goto isolate_fail;
+ continue;
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
+ if (!isolated && strict)
+ break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -335,15 +298,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (isolated) {
blockpfn += isolated - 1;
cursor += isolated - 1;
- continue;
}
-
-isolate_fail:
- if (strict)
- break;
- else
- continue;
-
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
@@ -353,7 +308,7 @@ isolate_fail:
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
- if (strict && blockpfn < end_pfn)
+ if (strict && nr_strict_required > total_isolated)
total_isolated = 0;
if (locked)
@@ -361,8 +316,7 @@ isolate_fail:
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, true,
- false);
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
@@ -493,14 +447,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
- bool set_unsuitable = true;
- const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
- ISOLATE_ASYNC_MIGRATE : 0) |
- (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -509,7 +460,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
- if (cc->mode == MIGRATE_ASYNC)
+ if (!cc->sync)
return 0;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -518,13 +469,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
return 0;
}
- if (compact_should_abort(cc))
- return 0;
-
/* Time to isolate some pages for migration */
+ cond_resched();
for (; low_pfn < end_pfn; low_pfn++) {
/* give a chance to irqs before checking need_resched() */
- if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
if (should_release_lock(&zone->lru_lock)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
locked = false;
@@ -563,32 +512,23 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* If isolation recently failed, do not retry */
pageblock_nr = low_pfn >> pageblock_order;
- if (last_pageblock_nr != pageblock_nr) {
- int mt;
-
- last_pageblock_nr = pageblock_nr;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
- /*
- * For async migration, also only scan in MOVABLE
- * blocks. Async migration is optimistic to see if
- * the minimum amount of work satisfies the allocation
- */
- mt = get_pageblock_migratetype(page);
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(mt)) {
- set_unsuitable = false;
- goto next_pageblock;
- }
- }
+ /* Skip if free */
+ if (PageBuddy(page))
+ continue;
/*
- * Skip if free. page_order cannot be used without zone->lock
- * as nothing prevents parallel allocations or buddy merging.
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
*/
- if (PageBuddy(page))
- continue;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ !migrate_async_suitable(get_pageblock_migratetype(page))) {
+ cc->finished_update_migrate = true;
+ goto next_pageblock;
+ }
/*
* Check may be lockless but that's ok as we recheck later.
@@ -599,7 +539,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
- goto isolate_success;
+ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+ goto check_compact_cluster;
}
}
continue;
@@ -622,15 +566,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- /*
- * Migration will fail if an anonymous page is pinned in memory,
- * so avoid taking lru_lock and isolating it unnecessarily in an
- * admittedly racy check.
- */
- if (!page_mapping(page) &&
- page_count(page) > page_mapcount(page))
- continue;
-
/* Check if it is ok to still hold the lock */
locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
locked, cc);
@@ -645,23 +580,28 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
+ if (!cc->sync)
+ mode |= ISOLATE_ASYNC_MIGRATE;
+
+ if (unevictable)
+ mode |= ISOLATE_UNEVICTABLE;
+
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
if (__isolate_lru_page(page, mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec, page_lru(page));
-
-isolate_success:
cc->finished_update_migrate = true;
+ del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
+check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -671,7 +611,9 @@ isolate_success:
continue;
next_pageblock:
- low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
@@ -679,13 +621,9 @@ next_pageblock:
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- /*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
- */
+ /* Update the pageblock-skip if the whole pageblock was scanned */
if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated,
- set_unsuitable, true);
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -706,48 +644,37 @@ static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
- unsigned long block_start_pfn; /* start of current pageblock */
- unsigned long block_end_pfn; /* end of current pageblock */
- unsigned long low_pfn; /* lowest pfn scanner is able to scan */
+ unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
- * successfully isolated from, zone-cached value, or the end of the
- * zone when isolating for the first time. We need this aligned to
- * the pageblock boundary, because we do
- * block_start_pfn -= pageblock_nr_pages in the for loop.
- * For ending point, take care when isolating in last pageblock of a
- * a zone which ends in the middle of a pageblock.
- * The low boundary is the end of the pageblock the migration scanner
- * is using.
+ * scanned from (or the end of the zone if starting). The low point
+ * is the end of the pageblock the migration scanner is using.
*/
- block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
- block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
- zone_end_pfn(zone));
- low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+ pfn = cc->free_pfn;
+ low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+
+ /*
+ * Take care that if the migration scanner is at the end of the zone
+ * that the free scanner does not accidentally move to the next zone
+ * in the next isolation cycle.
+ */
+ high_pfn = min(low_pfn, pfn);
+
+ zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
- block_end_pfn = block_start_pfn,
- block_start_pfn -= pageblock_nr_pages) {
+ for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ pfn -= pageblock_nr_pages) {
unsigned long isolated;
- /*
- * This can iterate a massively long zone without finding any
- * suitable migration targets, so periodically check if we need
- * to schedule, or even abort async compaction.
- */
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
-
- if (!pfn_valid(block_start_pfn))
+ if (!pfn_valid(pfn))
continue;
/*
@@ -757,7 +684,7 @@ static void isolate_freepages(struct zone *zone,
* i.e. it's possible that all pages within a zones range of
* pages do not belong to a single zone.
*/
- page = pfn_to_page(block_start_pfn);
+ page = pfn_to_page(pfn);
if (page_zone(page) != zone)
continue;
@@ -770,38 +697,35 @@ static void isolate_freepages(struct zone *zone,
continue;
/* Found a block suitable for isolating free pages from */
- cc->free_pfn = block_start_pfn;
- isolated = isolate_freepages_block(cc, block_start_pfn,
- block_end_pfn, freelist, false);
- nr_freepages += isolated;
+ isolated = 0;
/*
- * Set a flag that we successfully isolated in this pageblock.
- * In the next loop iteration, zone->compact_cached_free_pfn
- * will not be updated and thus it will effectively contain the
- * highest pageblock we isolated pages from.
+ * As pfn may not start aligned, pfn+pageblock_nr_page
+ * may cross a MAX_ORDER_NR_PAGES boundary and miss
+ * a pfn_valid check. Ensure isolate_freepages_block()
+ * only scans within a pageblock
*/
- if (isolated)
- cc->finished_update_free = true;
+ end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ end_pfn = min(end_pfn, zone_end_pfn);
+ isolated = isolate_freepages_block(cc, pfn, end_pfn,
+ freelist, false);
+ nr_freepages += isolated;
/*
- * isolate_freepages_block() might have aborted due to async
- * compaction being contended
+ * Record the highest PFN we isolated pages from. When next
+ * looking for free pages, the search will restart here as
+ * page migration may have returned some pages to the allocator
*/
- if (cc->contended)
- break;
+ if (isolated) {
+ cc->finished_update_free = true;
+ high_pfn = max(high_pfn, pfn);
+ }
}
/* split_free_page does not map the pages */
map_pages(freelist);
- /*
- * If we crossed the migrate scanner, we want to keep it that way
- * so that compact_finished() may detect this
- */
- if (block_start_pfn < low_pfn)
- cc->free_pfn = cc->migrate_pfn;
-
+ cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
}
@@ -816,13 +740,9 @@ static struct page *compaction_alloc(struct page *migratepage,
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /*
- * Isolate free pages if necessary, and if we are not aborting due to
- * contention.
- */
+ /* Isolate free pages if necessary */
if (list_empty(&cc->freepages)) {
- if (!cc->contended)
- isolate_freepages(cc->zone, cc);
+ isolate_freepages(cc->zone, cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -836,16 +756,23 @@ static struct page *compaction_alloc(struct page *migratepage,
}
/*
- * This is a migrate-callback that "frees" freepages back to the isolated
- * freelist. All pages on the freelist are from the same zone, so there is no
- * special handling needed for NUMA.
+ * We cannot control nr_migratepages and nr_freepages fully when migration is
+ * running as migrate_pages() has no knowledge of compact_control. When
+ * migration is complete, we count the number of pages on the lists by hand.
*/
-static void compaction_free(struct page *page, unsigned long data)
+static void update_nr_listpages(struct compact_control *cc)
{
- struct compact_control *cc = (struct compact_control *)data;
+ int nr_migratepages = 0;
+ int nr_freepages = 0;
+ struct page *page;
+
+ list_for_each_entry(page, &cc->migratepages, lru)
+ nr_migratepages++;
+ list_for_each_entry(page, &cc->freepages, lru)
+ nr_freepages++;
- list_add(&page->lru, &cc->freepages);
- cc->nr_freepages++;
+ cc->nr_migratepages = nr_migratepages;
+ cc->nr_freepages = nr_freepages;
}
/* possible outcome of isolate_migratepages */
@@ -868,7 +795,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
/* Do not cross the free scanner or scan within a memory hole */
if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -892,16 +819,11 @@ static int compact_finished(struct zone *zone,
unsigned int order;
unsigned long watermark;
- if (cc->contended || fatal_signal_pending(current))
+ if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
- /* Let the next compaction start anew. */
- zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
- zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
-
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
@@ -998,8 +920,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- const bool sync = cc->mode != MIGRATE_ASYNC;
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
@@ -1013,19 +934,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
/*
- * Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
- */
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
- __reset_isolation_suitable(zone);
-
- /*
* Setup to move all movable pages to the end of the zone. Used cached
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
*/
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn;
cc->free_pfn = zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
@@ -1033,15 +946,21 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ unsigned long nr_migrate, nr_remaining;
int err;
switch (isolate_migratepages(zone, cc)) {
@@ -1056,25 +975,22 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- if (!cc->nr_migratepages)
- continue;
-
+ nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
- compaction_free, (unsigned long)cc, cc->mode,
+ (unsigned long)cc, false,
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
MR_COMPACTION);
+ update_nr_listpages(cc);
+ nr_remaining = cc->nr_migratepages;
- trace_mm_compaction_migratepages(cc->nr_migratepages, err,
- &cc->migratepages);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
- /* All pages were either migrated or will be released */
- cc->nr_migratepages = 0;
+ /* Release isolated pages not migrated */
if (err) {
putback_movable_pages(&cc->migratepages);
- /*
- * migrate_pages() may return -ENOMEM when scanners meet
- * and we want compact_finished() to detect it
- */
- if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+ cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
goto out;
}
@@ -1086,13 +1002,12 @@ out:
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
- trace_mm_compaction_end(ret);
-
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone, int order,
- gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+static unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync, bool *contended)
{
unsigned long ret;
struct compact_control cc = {
@@ -1101,7 +1016,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
- .mode = mode,
+ .sync = sync,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1123,7 +1038,7 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
- * @mode: The migration mode for async, sync light, or sync migration
+ * @sync: Whether migration is synchronous or not
* @contended: Return value that is true if compaction was aborted due to lock contention
* @page: Optionally capture a free page of the requested order during compaction
*
@@ -1131,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, bool *contended)
+ bool sync, bool *contended)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1156,7 +1071,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, mode,
+ status = compact_zone_order(zone, order, gfp_mask, sync,
contended);
rc = max(status, rc);
@@ -1171,7 +1086,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact all zones within a node */
-static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
int zoneid;
struct zone *zone;
@@ -1192,38 +1107,40 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
compact_zone(zone, cc);
if (cc->order > 0) {
- if (zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0))
- compaction_defer_reset(zone, cc->order, false);
+ int ok = zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0);
+ if (ok && cc->order >= zone->compact_order_failed)
+ zone->compact_order_failed = cc->order + 1;
+ /* Currently async compaction is never deferred. */
+ else if (!ok && cc->sync)
+ defer_compaction(zone, cc->order);
}
VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
}
+
+ return 0;
}
-void compact_pgdat(pg_data_t *pgdat, int order)
+int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
- .mode = MIGRATE_ASYNC,
+ .sync = false,
};
- if (!order)
- return;
-
- __compact_pgdat(pgdat, &cc);
+ return __compact_pgdat(pgdat, &cc);
}
-static void compact_node(int nid)
+static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
+ .sync = true,
};
- __compact_pgdat(NODE_DATA(nid), &cc);
+ return __compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
@@ -1260,7 +1177,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
+ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 306baa594f9..c69781e97cf 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -170,16 +170,24 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->boundary = boundary;
retval->allocation = allocation;
- INIT_LIST_HEAD(&retval->pools);
+ if (dev) {
+ int ret;
- mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools) &&
- device_create_file(dev, &dev_attr_pools)) {
- kfree(retval);
- return NULL;
+ mutex_lock(&pools_lock);
+ if (list_empty(&dev->dma_pools))
+ ret = device_create_file(dev, &dev_attr_pools);
+ else
+ ret = 0;
+ /* note: not currently insisting "name" be unique */
+ if (!ret)
+ list_add(&retval->pools, &dev->dma_pools);
+ else {
+ kfree(retval);
+ retval = NULL;
+ }
+ mutex_unlock(&pools_lock);
} else
- list_add(&retval->pools, &dev->dma_pools);
- mutex_unlock(&pools_lock);
+ INIT_LIST_HEAD(&retval->pools);
return retval;
}
@@ -333,10 +341,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
continue;
if (pool->dev)
dev_err(pool->dev,
- "dma_pool_alloc %s, %p (corrupted)\n",
+ "dma_pool_alloc %s, %p (corruped)\n",
pool->name, retval);
else
- pr_err("dma_pool_alloc %s, %p (corrupted)\n",
+ pr_err("dma_pool_alloc %s, %p (corruped)\n",
pool->name, retval);
/*
@@ -500,6 +508,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
{
struct device *dev = pool->dev;
- WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
+ WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+ dma_pool_destroy(pool);
}
EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
deleted file mode 100644
index e10ccd299d6..00000000000
--- a/mm/early_ioremap.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Provide common bits of early_ioremap() support for architectures needing
- * temporary mappings during boot before ioremap() is available.
- *
- * This is mostly a direct copy of the x86 early_ioremap implementation.
- *
- * (C) Copyright 1995 1996, 2014 Linus Torvalds
- *
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <asm/fixmap.h>
-
-#ifdef CONFIG_MMU
-static int early_ioremap_debug __initdata;
-
-static int __init early_ioremap_debug_setup(char *str)
-{
- early_ioremap_debug = 1;
-
- return 0;
-}
-early_param("early_ioremap_debug", early_ioremap_debug_setup);
-
-static int after_paging_init __initdata;
-
-void __init __weak early_ioremap_shutdown(void)
-{
-}
-
-void __init early_ioremap_reset(void)
-{
- early_ioremap_shutdown();
- after_paging_init = 1;
-}
-
-/*
- * Generally, ioremap() is available after paging_init() has been called.
- * Architectures wanting to allow early_ioremap after paging_init() can
- * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
- */
-#ifndef __late_set_fixmap
-static inline void __init __late_set_fixmap(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t prot)
-{
- BUG();
-}
-#endif
-
-#ifndef __late_clear_fixmap
-static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
-{
- BUG();
-}
-#endif
-
-static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
-static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
-static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
-
-void __init early_ioremap_setup(void)
-{
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (WARN_ON(prev_map[i]))
- break;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
-}
-
-static int __init check_early_ioremap_leak(void)
-{
- int count = 0;
- int i;
-
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (prev_map[i])
- count++;
-
- if (WARN(count, KERN_WARNING
- "Debug warning: early ioremap leak of %d areas detected.\n"
- "please boot with early_ioremap_debug and report the dmesg.\n",
- count))
- return 1;
- return 0;
-}
-late_initcall(check_early_ioremap_leak);
-
-static void __init __iomem *
-__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
-{
- unsigned long offset;
- resource_size_t last_addr;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (!prev_map[i]) {
- slot = i;
- break;
- }
- }
-
- if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
- __func__, (u64)phys_addr, size))
- return NULL;
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (WARN_ON(!size || last_addr < phys_addr))
- return NULL;
-
- prev_size[slot] = size;
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
- /*
- * Mappings have to fit in the FIX_BTMAP area.
- */
- nrpages = size >> PAGE_SHIFT;
- if (WARN_ON(nrpages > NR_FIX_BTMAPS))
- return NULL;
-
- /*
- * Ok, go for it..
- */
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- if (after_paging_init)
- __late_set_fixmap(idx, phys_addr, prot);
- else
- __early_set_fixmap(idx, phys_addr, prot);
- phys_addr += PAGE_SIZE;
- --idx;
- --nrpages;
- }
- WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
- __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
-
- prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
- return prev_map[slot];
-}
-
-void __init early_iounmap(void __iomem *addr, unsigned long size)
-{
- unsigned long virt_addr;
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
- int i, slot;
-
- slot = -1;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
- if (prev_map[i] == addr) {
- slot = i;
- break;
- }
- }
-
- if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
- addr, size))
- return;
-
- if (WARN(prev_size[slot] != size,
- "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]))
- return;
-
- WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
- addr, size, slot);
-
- virt_addr = (unsigned long)addr;
- if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
- return;
-
- offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
-
- idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
- while (nrpages > 0) {
- if (after_paging_init)
- __late_clear_fixmap(idx);
- else
- __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
- --idx;
- --nrpages;
- }
- prev_map[slot] = NULL;
-}
-
-/* Remap an IO device */
-void __init __iomem *
-early_ioremap(resource_size_t phys_addr, unsigned long size)
-{
- return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
-}
-
-/* Remap memory */
-void __init *
-early_memremap(resource_size_t phys_addr, unsigned long size)
-{
- return (__force void *)__early_ioremap(phys_addr, size,
- FIXMAP_PAGE_NORMAL);
-}
-#else /* CONFIG_MMU */
-
-void __init __iomem *
-early_ioremap(resource_size_t phys_addr, unsigned long size)
-{
- return (__force void __iomem *)phys_addr;
-}
-
-/* Remap memory */
-void __init *
-early_memremap(resource_size_t phys_addr, unsigned long size)
-{
- return (void *)phys_addr;
-}
-
-void __init early_iounmap(void __iomem *addr, unsigned long size)
-{
-}
-
-#endif /* CONFIG_MMU */
-
-
-void __init early_memunmap(void *addr, unsigned long size)
-{
- early_iounmap((__force void __iomem *)addr, size);
-}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3bcfd81db45..a47f0f50c89 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,7 +17,6 @@
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
-#include <linux/swap.h>
#include <asm/unistd.h>
@@ -25,7 +24,7 @@
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
-SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
struct fd f = fdget(fd);
struct address_space *mapping;
@@ -39,7 +38,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
if (!f.file)
return -EBADF;
- if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+ if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
@@ -121,22 +120,9 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index >= start_index) {
- unsigned long count = invalidate_mapping_pages(mapping,
- start_index, end_index);
-
- /*
- * If fewer pages were invalidated than expected then
- * it is possible that some of the pages were on
- * a per-cpu pagevec for a remote CPU. Drain all
- * pagevecs and try again.
- */
- if (count < (end_index - start_index + 1)) {
- lru_add_drain_all();
- invalidate_mapping_pages(mapping, start_index,
+ if (end_index >= start_index)
+ invalidate_mapping_pages(mapping, start_index,
end_index);
- }
- }
break;
default:
ret = -EINVAL;
@@ -145,12 +131,26 @@ out:
fdput(f);
return ret;
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+ return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif
#ifdef __ARCH_WANT_SYS_FADVISE64
-SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
{
return sys_fadvise64_64(fd, offset, len, advice);
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+ return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index dafb06f70a0..83efee76a5c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,12 +33,8 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
-#include <linux/rmap.h>
#include "internal.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/filemap.h>
-
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
@@ -77,7 +73,7 @@
* ->mmap_sem
* ->lock_page (access_process_vm)
*
- * ->i_mutex (generic_perform_write)
+ * ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
@@ -108,79 +104,15 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
-static void page_cache_tree_delete(struct address_space *mapping,
- struct page *page, void *shadow)
-{
- struct radix_tree_node *node;
- unsigned long index;
- unsigned int offset;
- unsigned int tag;
- void **slot;
-
- VM_BUG_ON(!PageLocked(page));
-
- __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
-
- if (shadow) {
- mapping->nrshadows++;
- /*
- * Make sure the nrshadows update is committed before
- * the nrpages update so that final truncate racing
- * with reclaim does not see both counters 0 at the
- * same time and miss a shadow entry.
- */
- smp_wmb();
- }
- mapping->nrpages--;
-
- if (!node) {
- /* Clear direct pointer tags in root node */
- mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
- radix_tree_replace_slot(slot, shadow);
- return;
- }
-
- /* Clear tree tags for the removed page */
- index = page->index;
- offset = index & RADIX_TREE_MAP_MASK;
- for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (test_bit(offset, node->tags[tag]))
- radix_tree_tag_clear(&mapping->page_tree, index, tag);
- }
-
- /* Delete page, swap shadow entry */
- radix_tree_replace_slot(slot, shadow);
- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
- return;
-
- /*
- * Track node that only contains shadow entries.
- *
- * Avoid acquiring the list_lru lock if already tracked. The
- * list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes, &node->private_list);
- }
-}
-
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
@@ -191,11 +123,10 @@ void __delete_from_page_cache(struct page *page, void *shadow)
else
cleancache_invalidate_page(mapping, page);
- page_cache_tree_delete(mapping, page, shadow);
-
+ radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
-
+ mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
@@ -231,7 +162,7 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(page, NULL);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -253,19 +184,6 @@ static int sleep_on_page_killable(void *word)
return fatal_signal_pending(current) ? -EINTR : 0;
}
-static int filemap_check_errors(struct address_space *mapping)
-{
- int ret = 0;
- /* Check for outstanding write errors */
- if (test_bit(AS_ENOSPC, &mapping->flags) &&
- test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_bit(AS_EIO, &mapping->flags) &&
- test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
- return ret;
-}
-
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -347,10 +265,10 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret2, ret = 0;
+ int ret = 0;
if (end_byte < start_byte)
- goto out;
+ return 0;
pagevec_init(&pvec, 0);
while ((index <= end) &&
@@ -373,10 +291,12 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
pagevec_release(&pvec);
cond_resched();
}
-out:
- ret2 = filemap_check_errors(mapping);
- if (!ret)
- ret = ret2;
+
+ /* Check for outstanding write errors */
+ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
return ret;
}
@@ -417,8 +337,6 @@ int filemap_write_and_wait(struct address_space *mapping)
if (!err)
err = err2;
}
- } else {
- err = filemap_check_errors(mapping);
}
return err;
}
@@ -450,8 +368,6 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (!err)
err = err2;
}
- } else {
- err = filemap_check_errors(mapping);
}
return err;
}
@@ -476,9 +392,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
int error;
- VM_BUG_ON_PAGE(!PageLocked(old), old);
- VM_BUG_ON_PAGE(!PageLocked(new), new);
- VM_BUG_ON_PAGE(new->mapping, new);
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (!error) {
@@ -493,7 +409,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->index = offset;
spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(old, NULL);
+ __delete_from_page_cache(old);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -513,91 +429,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int page_cache_tree_insert(struct address_space *mapping,
- struct page *page, void **shadowp)
-{
- struct radix_tree_node *node;
- void **slot;
- int error;
-
- error = __radix_tree_create(&mapping->page_tree, page->index,
- &node, &slot);
- if (error)
- return error;
- if (*slot) {
- void *p;
-
- p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
- if (!radix_tree_exceptional_entry(p))
- return -EEXIST;
- if (shadowp)
- *shadowp = p;
- mapping->nrshadows--;
- if (node)
- workingset_node_shadows_dec(node);
- }
- radix_tree_replace_slot(slot, page);
- mapping->nrpages++;
- if (node) {
- workingset_node_pages_inc(node);
- /*
- * Don't track node that contains actual pages.
- *
- * Avoid acquiring the list_lru lock if already
- * untracked. The list_empty() test is safe as
- * node->private_list is protected by
- * mapping->tree_lock.
- */
- if (!list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- }
- return 0;
-}
-
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
-{
- int error;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapBacked(page), page);
-
- error = mem_cgroup_charge_file(page, current->mm,
- gfp_mask & GFP_RECLAIM_MASK);
- if (error)
- return error;
-
- error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
- if (error) {
- mem_cgroup_uncharge_cache_page(page);
- return error;
- }
-
- page_cache_get(page);
- page->mapping = mapping;
- page->index = offset;
-
- spin_lock_irq(&mapping->tree_lock);
- error = page_cache_tree_insert(mapping, page, shadowp);
- radix_tree_preload_end();
- if (unlikely(error))
- goto err_insert;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- spin_unlock_irq(&mapping->tree_lock);
- trace_mm_filemap_add_to_page_cache(page);
- return 0;
-err_insert:
- page->mapping = NULL;
- /* Leave page->index set: truncation relies upon it */
- spin_unlock_irq(&mapping->tree_lock);
- mem_cgroup_uncharge_cache_page(page);
- page_cache_release(page);
- return error;
-}
-
/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
@@ -611,35 +442,51 @@ err_insert:
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- return __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, NULL);
+ int error;
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
+
+ error = mem_cgroup_cache_charge(page, current->mm,
+ gfp_mask & GFP_RECLAIM_MASK);
+ if (error)
+ goto out;
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (error == 0) {
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = radix_tree_insert(&mapping->page_tree, offset, page);
+ if (likely(!error)) {
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+ page_cache_release(page);
+ }
+ radix_tree_preload_end();
+ } else
+ mem_cgroup_uncharge_cache_page(page);
+out:
+ return error;
}
EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- void *shadow = NULL;
int ret;
- __set_page_locked(page);
- ret = __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, &shadow);
- if (unlikely(ret))
- __clear_page_locked(page);
- else {
- /*
- * The page might have been evicted from cache only
- * recently, in which case it should be activated like
- * any other repeatedly accessed page.
- */
- if (shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
- lru_cache_add(page);
- }
+ ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+ if (ret == 0)
+ lru_cache_add_file(page);
return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -653,10 +500,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
- cpuset_mems_cookie = read_mems_allowed_begin();
+ cpuset_mems_cookie = get_mems_allowed();
n = cpuset_mem_spread_node();
page = alloc_pages_exact_node(n, gfp, 0);
- } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
return page;
}
@@ -740,9 +587,9 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -753,51 +600,17 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- /*
- * TestClearPageReclaim could be used here but it is an atomic
- * operation and overkill in this particular case. Failing to
- * shuffle a page marked for immediate reclaim is too mild to
- * justify taking an atomic operation penalty at the end of
- * ever page writeback.
- */
- if (PageReclaim(page)) {
- ClearPageReclaim(page);
+ if (TestClearPageReclaim(page))
rotate_reclaimable_page(page);
- }
if (!test_clear_page_writeback(page))
BUG();
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
-/*
- * After completing I/O on a page, call this routine to update the page
- * flags appropriately
- */
-void page_endio(struct page *page, int rw, int err)
-{
- if (rw == READ) {
- if (!err) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } else { /* rw == WRITE */
- if (err) {
- SetPageError(page);
- if (page->mapping)
- mapping_set_error(page->mapping, err);
- }
- end_page_writeback(page);
- }
-}
-EXPORT_SYMBOL_GPL(page_endio);
-
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
@@ -853,101 +666,14 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
}
/**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
- *
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
- */
-pgoff_t page_cache_next_hole(struct address_space *mapping,
- pgoff_t index, unsigned long max_scan)
-{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->page_tree, index);
- if (!page || radix_tree_exceptional_entry(page))
- break;
- index++;
- if (index == 0)
- break;
- }
-
- return index;
-}
-EXPORT_SYMBOL(page_cache_next_hole);
-
-/**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
- *
- * Search backwards in the range [max(index-max_scan+1, 0), index] for
- * the first hole.
- *
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >=
- * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
- * will be returned.
- *
- * page_cache_prev_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 10, then subsequently a hole is created at
- * index 5, page_cache_prev_hole covering both indexes may return 5 if
- * called under rcu_read_lock.
- */
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
- pgoff_t index, unsigned long max_scan)
-{
- unsigned long i;
-
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->page_tree, index);
- if (!page || radix_tree_exceptional_entry(page))
- break;
- index--;
- if (index == ULONG_MAX)
- break;
- }
-
- return index;
-}
-EXPORT_SYMBOL(page_cache_prev_hole);
-
-/**
- * find_get_entry - find and get a page cache entry
+ * find_get_page - find and get a page reference
* @mapping: the address_space to search
- * @offset: the page cache index
- *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
+ * @offset: the page index
*
- * Otherwise, %NULL is returned.
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
struct page *page;
@@ -964,9 +690,9 @@ repeat:
if (radix_tree_deref_retry(page))
goto repeat;
/*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
*/
goto out;
}
@@ -988,30 +714,24 @@ out:
return page;
}
-EXPORT_SYMBOL(find_get_entry);
+EXPORT_SYMBOL(find_get_page);
/**
- * find_lock_entry - locate, pin and lock a page cache entry
+ * find_lock_page - locate, pin and lock a pagecache page
* @mapping: the address_space to search
- * @offset: the page cache index
- *
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
+ * @offset: the page index
*
- * Otherwise, %NULL is returned.
+ * Locates the desired pagecache page, locks it, increments its reference
+ * count and returns its address.
*
- * find_lock_entry() may sleep.
+ * Returns zero if the page was not present. find_lock_page() may sleep.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_page(mapping, offset);
if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
@@ -1020,91 +740,48 @@ repeat:
page_cache_release(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON(page->index != offset);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
+EXPORT_SYMBOL(find_lock_page);
/**
- * pagecache_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- * @fgp_flags: PCG flags
- * @gfp_mask: gfp mask to use if a page is to be allocated
+ * find_or_create_page - locate or add a pagecache page
+ * @mapping: the page's address_space
+ * @index: the page's index into the mapping
+ * @gfp_mask: page allocation mode
*
- * Looks up the page cache slot at @mapping & @offset.
+ * Locates a page in the pagecache. If the page is not present, a new page
+ * is allocated using @gfp_mask and is added to the pagecache and to the VM's
+ * LRU list. The returned page is locked and has its reference count
+ * incremented.
*
- * PCG flags modify how the page is returned
+ * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
+ * allocation!
*
- * FGP_ACCESSED: the page will be marked accessed
- * FGP_LOCK: Page is return locked
- * FGP_CREAT: If page is not present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU
- * list. The page is returned locked and with an increased
- * refcount. Otherwise, %NULL is returned.
- *
- * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
- * if the GFP flags specified for FGP_CREAT are atomic.
- *
- * If there is a page cache page, it is returned with an increased refcount.
+ * find_or_create_page() returns the desired page's address, or zero on
+ * memory exhaustion.
*/
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
- int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
+struct page *find_or_create_page(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask)
{
struct page *page;
-
+ int err;
repeat:
- page = find_get_entry(mapping, offset);
- if (radix_tree_exceptional_entry(page))
- page = NULL;
- if (!page)
- goto no_page;
-
- if (fgp_flags & FGP_LOCK) {
- if (fgp_flags & FGP_NOWAIT) {
- if (!trylock_page(page)) {
- page_cache_release(page);
- return NULL;
- }
- } else {
- lock_page(page);
- }
-
- /* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
- VM_BUG_ON_PAGE(page->index != offset, page);
- }
-
- if (page && (fgp_flags & FGP_ACCESSED))
- mark_page_accessed(page);
-
-no_page:
- if (!page && (fgp_flags & FGP_CREAT)) {
- int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
- cache_gfp_mask |= __GFP_WRITE;
- if (fgp_flags & FGP_NOFS) {
- cache_gfp_mask &= ~__GFP_FS;
- radix_gfp_mask &= ~__GFP_FS;
- }
-
- page = __page_cache_alloc(cache_gfp_mask);
+ page = find_lock_page(mapping, index);
+ if (!page) {
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
-
- if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
- fgp_flags |= FGP_LOCK;
-
- /* Init accessed so avoit atomic mark_page_accessed later */
- if (fgp_flags & FGP_ACCESSED)
- init_page_accessed(page);
-
- err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ /*
+ * We want a regular kernel memory (not highmem or DMA etc)
+ * allocation for the radix tree nodes, but we need to honour
+ * the context-specific requirements the caller has asked for.
+ * GFP_RECLAIM_MASK collects those requirements.
+ */
+ err = add_to_page_cache_lru(page, mapping, index,
+ (gfp_mask & GFP_RECLAIM_MASK));
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -1112,80 +789,9 @@ no_page:
goto repeat;
}
}
-
return page;
}
-EXPORT_SYMBOL(pagecache_get_page);
-
-/**
- * find_get_entries - gang pagecache lookup
- * @mapping: The address_space to search
- * @start: The starting page cache index
- * @nr_entries: The maximum number of entries
- * @entries: Where the resulting entries are placed
- * @indices: The cache indices corresponding to the entries in @entries
- *
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping. The entries are placed at
- * @entries. find_get_entries() takes a reference against any actual
- * pages it returns.
- *
- * The search returns a group of mapping-contiguous page cache entries
- * with ascending indexes. There may be holes in the indices due to
- * not-present pages.
- *
- * Any shadow entries of evicted pages, or swap entries from
- * shmem/tmpfs, are included in the returned array.
- *
- * find_get_entries() returns the number of pages and shadow entries
- * which were found.
- */
-unsigned find_get_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
-{
- void **slot;
- unsigned int ret = 0;
- struct radix_tree_iter iter;
-
- if (!nr_entries)
- return 0;
-
- rcu_read_lock();
-restart:
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
- */
- goto export;
- }
- if (!page_cache_get_speculative(page))
- goto repeat;
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- page_cache_release(page);
- goto repeat;
- }
-export:
- indices[ret] = iter.index;
- entries[ret] = page;
- if (++ret == nr_entries)
- break;
- }
- rcu_read_unlock();
- return ret;
-}
+EXPORT_SYMBOL(find_or_create_page);
/**
* find_get_pages - gang pagecache lookup
@@ -1233,9 +839,9 @@ repeat:
goto restart;
}
/*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Skip
- * over it.
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so skip over it -
+ * we only reach this from invalidate_mapping_pages().
*/
continue;
}
@@ -1300,9 +906,9 @@ repeat:
goto restart;
}
/*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Stop
- * looking for contiguous pages.
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so stop looking for
+ * contiguous pages.
*/
break;
}
@@ -1376,17 +982,10 @@ repeat:
goto restart;
}
/*
- * A shadow entry of a recently evicted page.
- *
- * Those entries should never be tagged, but
- * this tree walk is lockless and the tags are
- * looked up in bulk, one radix tree node at a
- * time, so there is a sizable window for page
- * reclaim to evict a page we saw tagged.
- *
- * Skip over it.
+ * This function is never used on a shmem/tmpfs
+ * mapping, so a swap entry won't be found here.
*/
- continue;
+ BUG();
}
if (!page_cache_get_speculative(page))
@@ -1412,6 +1011,39 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
+/**
+ * grab_cache_page_nowait - returns locked page at given index in given cache
+ * @mapping: target address_space
+ * @index: the page index
+ *
+ * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * This is intended for speculative data generators, where the data can
+ * be regenerated if the page couldn't be grabbed. This routine should
+ * be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page.
+ */
+struct page *
+grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page = find_get_page(mapping, index);
+
+ if (page) {
+ if (trylock_page(page))
+ return page;
+ page_cache_release(page);
+ return NULL;
+ }
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL(grab_cache_page_nowait);
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1437,8 +1069,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* do_generic_file_read - generic file read routine
* @filp: the file to read
* @ppos: current file position
- * @iter: data destination
- * @written: already copied
+ * @desc: read_descriptor
+ * @actor: read method
*
* This is a generic file read routine, and uses the
* mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1446,8 +1078,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
- struct iov_iter *iter, ssize_t written)
+static void do_generic_file_read(struct file *filp, loff_t *ppos,
+ read_descriptor_t *desc, read_actor_t actor)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
@@ -1457,12 +1089,12 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
- int error = 0;
+ int error;
index = *ppos >> PAGE_CACHE_SHIFT;
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
for (;;) {
@@ -1497,7 +1129,7 @@ find_page:
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
- offset, iter->count))
+ desc, offset))
goto page_not_up_to_date_locked;
unlock_page(page);
}
@@ -1547,23 +1179,23 @@ page_ok:
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
*/
-
- ret = copy_page_to_iter(page, offset, nr, iter);
+ ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
page_cache_release(page);
- written += ret;
- if (!iov_iter_count(iter))
- goto out;
- if (ret < nr) {
- error = -EFAULT;
- goto out;
- }
- continue;
+ if (ret == nr && desc->count)
+ continue;
+ goto out;
page_not_up_to_date:
/* Get exclusive access to the page ... */
@@ -1598,7 +1230,6 @@ readpage:
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
- error = 0;
goto find_page;
}
goto readpage_error;
@@ -1629,6 +1260,7 @@ readpage:
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
+ desc->error = error;
page_cache_release(page);
goto out;
@@ -1639,17 +1271,16 @@ no_cached_page:
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
- error = -ENOMEM;
+ desc->error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
- if (error == -EEXIST) {
- error = 0;
+ if (error == -EEXIST)
goto find_page;
- }
+ desc->error = error;
goto out;
}
goto readpage;
@@ -1662,66 +1293,185 @@ out:
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
file_accessed(filp);
- return written ? written : error;
}
+int file_read_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
+{
+ char *kaddr;
+ unsigned long left, count = desc->count;
+
+ if (size > count)
+ size = count;
+
+ /*
+ * Faults on the destination of a read are common, so do it before
+ * taking the kmap.
+ */
+ if (!fault_in_pages_writeable(desc->arg.buf, size)) {
+ kaddr = kmap_atomic(page);
+ left = __copy_to_user_inatomic(desc->arg.buf,
+ kaddr + offset, size);
+ kunmap_atomic(kaddr);
+ if (left == 0)
+ goto success;
+ }
+
+ /* Do it the slow way */
+ kaddr = kmap(page);
+ left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+ kunmap(page);
+
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+success:
+ desc->count = count - size;
+ desc->written += size;
+ desc->arg.buf += size;
+ return size;
+}
+
+/*
+ * Performs necessary checks before doing a write
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @count: number of bytes to write
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Adjust number of segments and amount of bytes to write (nr_segs should be
+ * properly initialized first). Returns appropriate error code that caller
+ * should return or zero in case that write should be allowed.
+ */
+int generic_segment_checks(const struct iovec *iov,
+ unsigned long *nr_segs, size_t *count, int access_flags)
+{
+ unsigned long seg;
+ size_t cnt = 0;
+ for (seg = 0; seg < *nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ cnt += iv->iov_len;
+ if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
+ return -EFAULT;
+ *nr_segs = seg;
+ cnt -= iv->iov_len; /* This segment is no good */
+ break;
+ }
+ *count = cnt;
+ return 0;
+}
+EXPORT_SYMBOL(generic_segment_checks);
+
/**
- * generic_file_read_iter - generic filesystem read routine
+ * generic_file_aio_read - generic filesystem read routine
* @iocb: kernel I/O control block
- * @iter: destination for the data read
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @pos: current file position
*
- * This is the "read_iter()" routine for all filesystems
+ * This is the "read()" routine for all filesystems
* that can use the page cache directly.
*/
ssize_t
-generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- struct file *file = iocb->ki_filp;
- ssize_t retval = 0;
+ struct file *filp = iocb->ki_filp;
+ ssize_t retval;
+ unsigned long seg = 0;
+ size_t count;
loff_t *ppos = &iocb->ki_pos;
- loff_t pos = *ppos;
+
+ count = 0;
+ retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (retval)
+ return retval;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (file->f_flags & O_DIRECT) {
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
+ if (filp->f_flags & O_DIRECT) {
loff_t size;
+ struct address_space *mapping;
+ struct inode *inode;
+ mapping = filp->f_mapping;
+ inode = mapping->host;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
- retval = filemap_write_and_wait_range(mapping, pos,
- pos + count - 1);
- if (!retval) {
- struct iov_iter data = *iter;
- retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
- }
+ if (pos < size) {
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + iov_length(iov, nr_segs) - 1);
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+ }
+ if (retval > 0) {
+ *ppos = pos + retval;
+ count -= retval;
+ }
- if (retval > 0) {
- *ppos = pos + retval;
- iov_iter_advance(iter, retval);
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
+ file_accessed(filp);
+ goto out;
+ }
}
+ }
+
+ count = retval;
+ for (seg = 0; seg < nr_segs; seg++) {
+ read_descriptor_t desc;
+ loff_t offset = 0;
/*
- * Btrfs can have a short DIO read if we encounter
- * compressed extents, so if there was an error, or if
- * we've already read everything we wanted to, or if
- * there was a short read because we hit EOF, go ahead
- * and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
+ * If we did a short DIO read we need to skip the section of the
+ * iov that we've already read data into.
*/
- if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
- file_accessed(file);
- goto out;
+ if (count) {
+ if (count > iov[seg].iov_len) {
+ count -= iov[seg].iov_len;
+ continue;
+ }
+ offset = count;
+ count = 0;
}
- }
- retval = do_generic_file_read(file, ppos, iter, retval);
+ desc.written = 0;
+ desc.arg.buf = iov[seg].iov_base + offset;
+ desc.count = iov[seg].iov_len - offset;
+ if (desc.count == 0)
+ continue;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+ retval += desc.written;
+ if (desc.error) {
+ retval = retval ?: desc.error;
+ break;
+ }
+ if (desc.count > 0)
+ break;
+ }
out:
return retval;
}
-EXPORT_SYMBOL(generic_file_read_iter);
+EXPORT_SYMBOL(generic_file_aio_read);
#ifdef CONFIG_MMU
/**
@@ -1771,12 +1521,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
+ if (VM_RandomReadHint(vma))
return;
if (!ra->ra_pages)
return;
- if (vma->vm_flags & VM_SEQ_READ) {
+ if (VM_SequentialReadHint(vma)) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
@@ -1816,7 +1566,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
+ if (VM_RandomReadHint(vma))
return;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
@@ -1846,11 +1596,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
- loff_t size;
+ pgoff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (offset >= size >> PAGE_CACHE_SHIFT)
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (offset >= size)
return VM_FAULT_SIGBUS;
/*
@@ -1886,7 +1636,7 @@ retry_find:
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON(page->index != offset);
/*
* We have a locked page in the page cache, now we need to check
@@ -1899,8 +1649,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(offset >= size)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
@@ -1958,82 +1708,10 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct radix_tree_iter iter;
- void **slot;
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- loff_t size;
- struct page *page;
- unsigned long address = (unsigned long) vmf->virtual_address;
- unsigned long addr;
- pte_t *pte;
-
- rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
- if (iter.index > vmf->max_pgoff)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- goto next;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
- }
-
- if (!page_cache_get_speculative(page))
- goto repeat;
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- page_cache_release(page);
- goto repeat;
- }
-
- if (!PageUptodate(page) ||
- PageReadahead(page) ||
- PageHWPoison(page))
- goto skip;
- if (!trylock_page(page))
- goto skip;
-
- if (page->mapping != mapping || !PageUptodate(page))
- goto unlock;
-
- size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
- if (page->index >= size >> PAGE_CACHE_SHIFT)
- goto unlock;
-
- pte = vmf->pte + page->index - vmf->pgoff;
- if (!pte_none(*pte))
- goto unlock;
-
- if (file->f_ra.mmap_miss > 0)
- file->f_ra.mmap_miss--;
- addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
- do_set_pte(vma, addr, page, pte, false, false);
- unlock_page(page);
- goto next;
-unlock:
- unlock_page(page);
-skip:
- page_cache_release(page);
-next:
- if (iter.index == vmf->max_pgoff)
- break;
- }
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(filemap_map_pages);
-
int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int ret = VM_FAULT_LOCKED;
sb_start_pagefault(inode->i_sb);
@@ -2050,7 +1728,6 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* see the dirty page and writeprotect it again.
*/
set_page_dirty(page);
- wait_for_stable_page(page);
out:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -2059,7 +1736,6 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
- .map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -2100,18 +1776,6 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
-static struct page *wait_on_page_read(struct page *page)
-{
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
@@ -2138,8 +1802,6 @@ repeat:
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
}
}
return page;
@@ -2176,10 +1838,6 @@ retry:
if (err < 0) {
page_cache_release(page);
return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
}
out:
mark_page_accessed(page);
@@ -2187,25 +1845,40 @@ out:
}
/**
- * read_cache_page - read into page cache, fill it if needed
+ * read_cache_page_async - read into page cache, fill it if needed
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
* @data: first arg to filler(data, page) function, often left as NULL
*
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
* Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page and wait for it to become unlocked.
+ * not set, try to fill the page but don't wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
*/
-struct page *read_cache_page(struct address_space *mapping,
+struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
-EXPORT_SYMBOL(read_cache_page);
+EXPORT_SYMBOL(read_cache_page_async);
+
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
/**
* read_cache_page_gfp - read into page cache, using specified page allocation flags.
@@ -2224,10 +1897,175 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
{
filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- return do_read_cache_page(mapping, index, filler, NULL, gfp);
+ return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
}
EXPORT_SYMBOL(read_cache_page_gfp);
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page then wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
+{
+ return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
+}
+EXPORT_SYMBOL(read_cache_page);
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same sideeffects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overruning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->iov = iov;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
/*
* Performs necessary checks before doing a write
*
@@ -2327,12 +2165,15 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
{
const struct address_space_operations *aops = mapping->a_ops;
+ mark_page_accessed(page);
return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
}
EXPORT_SYMBOL(pagecache_write_end);
ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -2340,9 +2181,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
ssize_t written;
size_t write_len;
pgoff_t end;
- struct iov_iter data;
- write_len = iov_iter_count(from);
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+ write_len = iov_length(iov, *nr_segs);
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2369,8 +2212,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
}
}
- data = *from;
- written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+ written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2387,12 +2229,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
if (written > 0) {
pos += written;
- iov_iter_advance(from, written);
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- iocb->ki_pos = pos;
+ *ppos = pos;
}
out:
return written;
@@ -2406,23 +2247,39 @@ EXPORT_SYMBOL(generic_file_direct_write);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
+ int status;
+ gfp_t gfp_mask;
struct page *page;
- int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+ gfp_t gfp_notmask = 0;
+ gfp_mask = mapping_gfp_mask(mapping);
+ if (mapping_cap_account_dirty(mapping))
+ gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
-
- page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping),
- GFP_KERNEL);
+ gfp_notmask = __GFP_FS;
+repeat:
+ page = find_lock_page(mapping, index);
if (page)
- wait_for_stable_page(page);
+ goto found;
+ page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
+ if (!page)
+ return NULL;
+ status = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & ~gfp_notmask);
+ if (unlikely(status)) {
+ page_cache_release(page);
+ if (status == -EEXIST)
+ goto repeat;
+ return NULL;
+ }
+found:
+ wait_on_page_writeback(page);
return page;
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-ssize_t generic_perform_write(struct file *file,
+static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
@@ -2466,15 +2323,18 @@ again:
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
- if (unlikely(status < 0))
+ if (unlikely(status))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
+ pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
flush_dcache_page(page);
+ mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
@@ -2509,12 +2369,34 @@ again:
return written ? written : status;
}
-EXPORT_SYMBOL(generic_perform_write);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t status;
+ struct iov_iter i;
+
+ iov_iter_init(&i, iov, nr_segs, count, written);
+ status = generic_perform_write(file, &i, pos);
+
+ if (likely(status >= 0)) {
+ written += status;
+ *ppos = pos + status;
+ }
+
+ return written ? written : status;
+}
+EXPORT_SYMBOL(generic_file_buffered_write);
/**
- * __generic_file_write_iter - write data to a file
+ * __generic_file_aio_write - write data to a file
* @iocb: IO state structure (file, offset, etc.)
- * @from: iov_iter with data to write
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @ppos: position where to write
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -2528,19 +2410,30 @@ EXPORT_SYMBOL(generic_perform_write);
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
*/
-ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
+ loff_t pos;
+ ssize_t written;
ssize_t err;
- ssize_t status;
- size_t count = iov_iter_count(from);
+
+ ocount = 0;
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ return err;
+
+ count = ocount;
+ pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2548,8 +2441,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (count == 0)
goto out;
- iov_iter_truncate(from, count);
-
err = file_remove_suid(file);
if (err)
goto out;
@@ -2561,40 +2452,42 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
+ ssize_t written_buffered;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
if (written < 0 || written == count)
goto out;
-
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
-
- status = generic_perform_write(file, from, pos);
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
/*
- * If generic_perform_write() returned a synchronous error
+ * If generic_file_buffered_write() retuned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
- if (unlikely(status < 0) && !written) {
- err = status;
+ if (written_buffered < 0) {
+ err = written_buffered;
goto out;
}
- iocb->ki_pos = pos + status;
+
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
- endbyte = pos + status - 1;
+ endbyte = pos + written_buffered - written - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
if (err == 0) {
- written += status;
+ written = written_buffered;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
@@ -2605,45 +2498,51 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
*/
}
} else {
- written = generic_perform_write(file, from, pos);
- if (likely(written >= 0))
- iocb->ki_pos = pos + written;
+ written = generic_file_buffered_write(iocb, iov, nr_segs,
+ pos, ppos, count, written);
}
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
-EXPORT_SYMBOL(__generic_file_write_iter);
+EXPORT_SYMBOL(__generic_file_aio_write);
/**
- * generic_file_write_iter - write data to a file
+ * generic_file_aio_write - write data to a file
* @iocb: IO state structure
- * @from: iov_iter with data to write
+ * @iov: vector with data to write
+ * @nr_segs: number of segments in the vector
+ * @pos: position in file where to write
*
- * This is a wrapper around __generic_file_write_iter() to be used by most
+ * This is a wrapper around __generic_file_aio_write() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
*/
-ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
+ BUG_ON(iocb->ki_pos != pos);
+
+ sb_start_write(inode->i_sb);
mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_iter(iocb, from);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0) {
+ if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
- err = generic_write_sync(file, iocb->ki_pos - ret, ret);
- if (err < 0)
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
ret = err;
}
+ sb_end_write(inode->i_sb);
return ret;
}
-EXPORT_SYMBOL(generic_file_write_iter);
+EXPORT_SYMBOL(generic_file_aio_write);
/**
* try_to_release_page() - release old fs-specific metadata on a page
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3f685..a912da6ddfd 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -26,7 +26,7 @@
* of ZERO_PAGE(), such as /dev/zero
*/
static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
static struct page *__xip_sparse_page;
/* called under xip_sparse_mutex */
@@ -404,6 +404,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
loff_t pos;
ssize_t ret;
+ sb_start_write(inode->i_sb);
+
mutex_lock(&inode->i_mutex);
if (!access_ok(VERIFY_READ, buf, len)) {
@@ -437,6 +439,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
current->backing_dev_info = NULL;
out_up:
mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/fremap.c b/mm/fremap.c
index 72b8fa36143..a0aaf0e5680 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,44 +23,28 @@
#include "internal.h"
-static int mm_counter(struct page *page)
-{
- return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
-}
-
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
- struct page *page;
- swp_entry_t entry;
if (pte_present(pte)) {
+ struct page *page;
+
flush_cache_page(vma, addr, pte_pfn(pte));
pte = ptep_clear_flush(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- update_hiwater_rss(mm);
- dec_mm_counter(mm, mm_counter(page));
page_remove_rmap(page);
page_cache_release(page);
- }
- } else { /* zap_pte() is not called when pte_none() */
- if (!pte_file(pte)) {
update_hiwater_rss(mm);
- entry = pte_to_swp_entry(pte);
- if (non_swap_entry(entry)) {
- if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
- dec_mm_counter(mm, mm_counter(page));
- }
- } else {
- free_swap_and_cache(entry);
- dec_mm_counter(mm, MM_SWAPENTS);
- }
+ dec_mm_counter(mm, MM_FILEPAGES);
}
+ } else {
+ if (!pte_file(pte))
+ free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear_not_present_full(mm, addr, ptep, 0);
}
}
@@ -73,19 +57,17 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
int err = -ENOMEM;
- pte_t *pte, ptfile;
+ pte_t *pte;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
- ptfile = pgoff_to_pte(pgoff);
-
if (!pte_none(*pte))
zap_pte(mm, vma, addr, pte);
- set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
+ set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
@@ -147,11 +129,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
- vm_flags_t vm_flags = 0;
-
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
if (prot)
return err;
@@ -183,11 +160,15 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
/*
* Make sure the vma is shared, that it supports prefaulting,
* and that the remapped range is valid and fully within
- * the single existing vma.
+ * the single existing vma. vm_private_data is used as a
+ * swapout cursor in a VM_NONLINEAR vma.
*/
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
+ if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
+ goto out;
+
if (!vma->vm_ops || !vma->vm_ops->remap_pages)
goto out;
@@ -196,13 +177,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
/* Must set VM_NONLINEAR before any pages are populated. */
if (!(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * vm_private_data is used as a swapout cursor
- * in a VM_NONLINEAR vma.
- */
- if (vma->vm_private_data)
- goto out;
-
/* Don't need a nonlinear mapping, exit success */
if (pgoff == linear_page_index(vma, start)) {
err = 0;
@@ -210,7 +184,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
if (!has_write_lock) {
-get_write_lock:
up_read(&mm->mmap_sem);
down_write(&mm->mmap_sem);
has_write_lock = 1;
@@ -225,10 +198,10 @@ get_write_lock:
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
struct file *file = get_file(vma->vm_file);
- /* mmap_region may free vma; grab the info now */
- vm_flags = vma->vm_flags;
- addr = mmap_region(file, start, size, vm_flags, pgoff);
+ flags &= MAP_NONBLOCK;
+ addr = mmap_region(file, start, size,
+ flags, vma->vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
@@ -236,7 +209,7 @@ get_write_lock:
BUG_ON(addr != start);
err = 0;
}
- goto out_freed;
+ goto out;
}
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
@@ -251,16 +224,28 @@ get_write_lock:
/*
* drop PG_Mlocked flag for over-mapped range
*/
- if (!has_write_lock)
- goto get_write_lock;
- vm_flags = vma->vm_flags;
+ vm_flags_t saved_flags = vma->vm_flags;
munlock_vma_pages_range(vma, start, start + size);
- vma->vm_flags = vm_flags;
+ vma->vm_flags = saved_flags;
}
mmu_notifier_invalidate_range_start(mm, start, start + size);
err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
mmu_notifier_invalidate_range_end(mm, start, start + size);
+ if (!err && !(flags & MAP_NONBLOCK)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ /*
+ * might be mapping previously unmapped range of file
+ */
+ mlock_vma_pages_range(vma, start, start + size);
+ } else {
+ if (unlikely(has_write_lock)) {
+ downgrade_write(&mm->mmap_sem);
+ has_write_lock = 0;
+ }
+ make_pages_present(start, start+size);
+ }
+ }
/*
* We can't clear VM_NONLINEAR because we'd have to do
@@ -269,15 +254,10 @@ get_write_lock:
*/
out:
- if (vma)
- vm_flags = vma->vm_flags;
-out_freed:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
up_write(&mm->mmap_sem);
- if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
- mm_populate(start, size);
return err;
}
diff --git a/mm/frontswap.c b/mm/frontswap.c
index c30eec536f0..2890e67d602 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -24,7 +24,15 @@
* frontswap_ops is set by frontswap_register_ops to contain the pointers
* to the frontswap "backend" implementation functions.
*/
-static struct frontswap_ops *frontswap_ops __read_mostly;
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
/*
* If enabled, frontswap_store will return failure even on success. As
@@ -72,70 +80,16 @@ static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif
-
-/*
- * Due to the asynchronous nature of the backends loading potentially
- * _after_ the swap system has been activated, we have chokepoints
- * on all frontswap functions to not call the backend until the backend
- * has registered.
- *
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
- * This would not guards us against the user deciding to call swapoff right as
- * we are calling the backend to initialize (so swapon is in action).
- * Fortunatly for us, the swapon_mutex has been taked by the callee so we are
- * OK. The other scenario where calls to frontswap_store (called via
- * swap_writepage) is racing with frontswap_invalidate_area (called via
- * swapoff) is again guarded by the swap subsystem.
- *
- * While no backend is registered all calls to frontswap_[store|load|
- * invalidate_area|invalidate_page] are ignored or fail.
- *
- * The time between the backend being registered and the swap file system
- * calling the backend (via the frontswap_* functions) is indeterminate as
- * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
- * That is OK as we are comfortable missing some of these calls to the newly
- * registered backend.
- *
- * Obviously the opposite (unloading the backend) must be done after all
- * the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
- */
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
-
/*
* Register operations for frontswap, returning previous thus allowing
* detection of multiple backends and possible nesting.
*/
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
{
- struct frontswap_ops *old = frontswap_ops;
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (test_and_clear_bit(i, need_init)) {
- struct swap_info_struct *sis = swap_info[i];
- /* __frontswap_init _should_ have set it! */
- if (!sis->frontswap_map)
- return ERR_PTR(-EINVAL);
- ops->init(i);
- }
- }
- /*
- * We MUST have frontswap_ops set _after_ the frontswap_init's
- * have been called. Otherwise __frontswap_store might fail. Hence
- * the barrier to make sure compiler does not re-order us.
- */
- barrier();
- frontswap_ops = ops;
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
return old;
}
EXPORT_SYMBOL(frontswap_register_ops);
@@ -161,48 +115,20 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
/*
* Called when a swap device is swapon'd.
*/
-void __frontswap_init(unsigned type, unsigned long *map)
+void __frontswap_init(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
BUG_ON(sis == NULL);
-
- /*
- * p->frontswap is a bitmap that we MUST have to figure out which page
- * has gone in frontswap. Without it there is no point of continuing.
- */
- if (WARN_ON(!map))
+ if (sis->frontswap_map == NULL)
return;
- /*
- * Irregardless of whether the frontswap backend has been loaded
- * before this function or it will be later, we _MUST_ have the
- * p->frontswap set to something valid to work properly.
- */
- frontswap_map_set(sis, map);
- if (frontswap_ops)
- frontswap_ops->init(type);
- else {
- BUG_ON(type > MAX_SWAPFILES);
- set_bit(type, need_init);
- }
+ frontswap_ops.init(type);
}
EXPORT_SYMBOL(__frontswap_init);
-bool __frontswap_test(struct swap_info_struct *sis,
- pgoff_t offset)
-{
- bool ret = false;
-
- if (frontswap_ops && sis->frontswap_map)
- ret = test_bit(offset, sis->frontswap_map);
- return ret;
-}
-EXPORT_SYMBOL(__frontswap_test);
-
-static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
+static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
{
- clear_bit(offset, sis->frontswap_map);
+ frontswap_clear(sis, offset);
atomic_dec(&sis->frontswap_pages);
}
@@ -221,20 +147,13 @@ int __frontswap_store(struct page *page)
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
- /*
- * Return if no backend registed.
- * Don't need to inc frontswap_failed_stores here.
- */
- if (!frontswap_ops)
- return ret;
-
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- if (__frontswap_test(sis, offset))
+ if (frontswap_test(sis, offset))
dup = 1;
- ret = frontswap_ops->store(type, offset, page);
+ ret = frontswap_ops.store(type, offset, page);
if (ret == 0) {
- set_bit(offset, sis->frontswap_map);
+ frontswap_set(sis, offset);
inc_frontswap_succ_stores();
if (!dup)
atomic_inc(&sis->frontswap_pages);
@@ -269,16 +188,13 @@ int __frontswap_load(struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset))
- ret = frontswap_ops->load(type, offset, page);
+ if (frontswap_test(sis, offset))
+ ret = frontswap_ops.load(type, offset, page);
if (ret == 0) {
inc_frontswap_loads();
if (frontswap_tmem_exclusive_gets_enabled) {
SetPageDirty(page);
- __frontswap_clear(sis, offset);
+ frontswap_clear(sis, offset);
}
}
return ret;
@@ -294,11 +210,8 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
struct swap_info_struct *sis = swap_info[type];
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset)) {
- frontswap_ops->invalidate_page(type, offset);
+ if (frontswap_test(sis, offset)) {
+ frontswap_ops.invalidate_page(type, offset);
__frontswap_clear(sis, offset);
inc_frontswap_invalidates();
}
@@ -313,26 +226,26 @@ void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
- if (frontswap_ops) {
- BUG_ON(sis == NULL);
- if (sis->frontswap_map == NULL)
- return;
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
- }
- clear_bit(type, need_init);
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
+ int type;
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list)
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
totalpages += atomic_read(&si->frontswap_pages);
+ }
return totalpages;
}
@@ -344,9 +257,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
@@ -361,7 +276,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
- *swapid = si->type;
+ *swapid = type;
ret = 0;
break;
}
@@ -408,7 +323,7 @@ void frontswap_shrink(unsigned long target_pages)
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_active_head each time
+ * so restart scan from swap_list.head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/gup.c b/mm/gup.c
deleted file mode 100644
index cc5a9e7adea..00000000000
--- a/mm/gup.c
+++ /dev/null
@@ -1,662 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/spinlock.h>
-
-#include <linux/hugetlb.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/rmap.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-
-#include "internal.h"
-
-static struct page *no_page_table(struct vm_area_struct *vma,
- unsigned int flags)
-{
- /*
- * When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate unnecessary pages or
- * page tables. Return error instead of NULL to skip handle_mm_fault,
- * then get_dump_page() will return NULL to leave a hole in the dump.
- * But we can only make this optimization where a hole would surely
- * be zero-filled if handle_mm_fault() actually did handle it.
- */
- if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
- return ERR_PTR(-EFAULT);
- return NULL;
-}
-
-static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- spinlock_t *ptl;
- pte_t *ptep, pte;
-
-retry:
- if (unlikely(pmd_bad(*pmd)))
- return no_page_table(vma, flags);
-
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte) || pte_file(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto retry;
- }
- if ((flags & FOLL_NUMA) && pte_numa(pte))
- goto no_page;
- if ((flags & FOLL_WRITE) && !pte_write(pte)) {
- pte_unmap_unlock(ptep, ptl);
- return NULL;
- }
-
- page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
- if ((flags & FOLL_DUMP) ||
- !is_zero_pfn(pte_pfn(pte)))
- goto bad_page;
- page = pte_page(pte);
- }
-
- if (flags & FOLL_GET)
- get_page_foll(page);
- if (flags & FOLL_TOUCH) {
- if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
- /*
- * pte_mkyoung() would be more correct here, but atomic care
- * is needed to avoid losing the dirty bit: it is easier to use
- * mark_page_accessed().
- */
- mark_page_accessed(page);
- }
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
- pte_unmap_unlock(ptep, ptl);
- return page;
-bad_page:
- pte_unmap_unlock(ptep, ptl);
- return ERR_PTR(-EFAULT);
-
-no_page:
- pte_unmap_unlock(ptep, ptl);
- if (!pte_none(pte))
- return NULL;
- return no_page_table(vma, flags);
-}
-
-/**
- * follow_page_mask - look up a page descriptor from a user-virtual address
- * @vma: vm_area_struct mapping @address
- * @address: virtual address to look up
- * @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
- *
- * @flags can have FOLL_ flags set, defined in <linux/mm.h>
- *
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
- * an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
- */
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- spinlock_t *ptl;
- struct page *page;
- struct mm_struct *mm = vma->vm_mm;
-
- *page_mask = 0;
-
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- return page;
- }
-
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return no_page_table(vma, flags);
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud))
- return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- return NULL;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- return page;
- }
- if (unlikely(pud_bad(*pud)))
- return no_page_table(vma, flags);
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else
- page = NULL;
- }
- return page;
- }
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
- return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
- spin_unlock(ptl);
- }
- return follow_page_pte(vma, address, pmd, flags);
-}
-
-static int get_gate_page(struct mm_struct *mm, unsigned long address,
- unsigned int gup_flags, struct vm_area_struct **vma,
- struct page **page)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int ret = -EFAULT;
-
- /* user gate pages are read-only */
- if (gup_flags & FOLL_WRITE)
- return -EFAULT;
- if (address > TASK_SIZE)
- pgd = pgd_offset_k(address);
- else
- pgd = pgd_offset_gate(mm, address);
- BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, address);
- BUG_ON(pud_none(*pud));
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return -EFAULT;
- VM_BUG_ON(pmd_trans_huge(*pmd));
- pte = pte_offset_map(pmd, address);
- if (pte_none(*pte))
- goto unmap;
- *vma = get_gate_vma(mm);
- if (!page)
- goto out;
- *page = vm_normal_page(*vma, address, *pte);
- if (!*page) {
- if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
- goto unmap;
- *page = pte_page(*pte);
- }
- get_page(*page);
-out:
- ret = 0;
-unmap:
- pte_unmap(pte);
- return ret;
-}
-
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *nonblocking)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned int fault_flags = 0;
- int ret;
-
- /* For mlock, just skip the stack guard page. */
- if ((*flags & FOLL_MLOCK) &&
- (stack_guard_page_start(vma, address) ||
- stack_guard_page_end(vma, address + PAGE_SIZE)))
- return -ENOENT;
- if (*flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
- if (*flags & FOLL_NOWAIT)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
-
- ret = handle_mm_fault(mm, vma, address, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
- if (ret & VM_FAULT_SIGBUS)
- return -EFAULT;
- BUG();
- }
-
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
- if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
- *nonblocking = 0;
- return -EBUSY;
- }
-
- /*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
- * necessary, even if maybe_mkwrite decided not to set pte_write. We
- * can thus safely do subsequent page lookups as if they were reads.
- * But only do so when looping for pte_write is futile: in some cases
- * userspace may also be wanting to write to the gotten user page,
- * which a read fault here might prevent (a readonly page might get
- * reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags &= ~FOLL_WRITE;
- return 0;
-}
-
-static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
-{
- vm_flags_t vm_flags = vma->vm_flags;
-
- if (vm_flags & (VM_IO | VM_PFNMAP))
- return -EFAULT;
-
- if (gup_flags & FOLL_WRITE) {
- if (!(vm_flags & VM_WRITE)) {
- if (!(gup_flags & FOLL_FORCE))
- return -EFAULT;
- /*
- * We used to let the write,force case do COW in a
- * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
- * set a breakpoint in a read-only mapping of an
- * executable, without corrupting the file (yet only
- * when that file had been opened for writing!).
- * Anon pages in shared mappings are surprising: now
- * just reject it.
- */
- if (!is_cow_mapping(vm_flags)) {
- WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
- return -EFAULT;
- }
- }
- } else if (!(vm_flags & VM_READ)) {
- if (!(gup_flags & FOLL_FORCE))
- return -EFAULT;
- /*
- * Is there actually any vma we can reach here which does not
- * have VM_MAYREAD set?
- */
- if (!(vm_flags & VM_MAYREAD))
- return -EFAULT;
- }
- return 0;
-}
-
-/**
- * __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * __get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * __get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
- * the page is written to, set_page_dirty (or set_page_dirty_lock, as
- * appropriate) must be called after the page is finished with, and
- * before put_page is called.
- *
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
- * or mmap_sem contention, and if waiting is needed to pin all pages,
- * *@nonblocking will be set to 0.
- *
- * In most cases, get_user_pages or get_user_pages_fast should be used
- * instead of __get_user_pages. __get_user_pages should be used only if
- * you need some special @gup_flags.
- */
-long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
-{
- long i = 0;
- unsigned int page_mask;
- struct vm_area_struct *vma = NULL;
-
- if (!nr_pages)
- return 0;
-
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
-
- /*
- * If FOLL_FORCE is set then do not force a full fault as the hinting
- * fault information is unrelated to the reference behaviour of a task
- * using the address space
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
- do {
- struct page *page;
- unsigned int foll_flags = gup_flags;
- unsigned int page_increm;
-
- /* first iteration or cross vma bound */
- if (!vma || start >= vma->vm_end) {
- vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(mm, start)) {
- int ret;
- ret = get_gate_page(mm, start & PAGE_MASK,
- gup_flags, &vma,
- pages ? &pages[i] : NULL);
- if (ret)
- return i ? : ret;
- page_mask = 0;
- goto next_page;
- }
-
- if (!vma || check_vma_flags(vma, gup_flags))
- return i ? : -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i,
- gup_flags);
- continue;
- }
- }
-retry:
- /*
- * If we have a pending SIGKILL, don't keep faulting pages and
- * potentially allocating memory.
- */
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
- cond_resched();
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
- if (!page) {
- int ret;
- ret = faultin_page(tsk, vma, start, &foll_flags,
- nonblocking);
- switch (ret) {
- case 0:
- goto retry;
- case -EFAULT:
- case -ENOMEM:
- case -EHWPOISON:
- return i ? i : ret;
- case -EBUSY:
- return i;
- case -ENOENT:
- goto next_page;
- }
- BUG();
- }
- if (IS_ERR(page))
- return i ? i : PTR_ERR(page);
- if (pages) {
- pages[i] = page;
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- page_mask = 0;
- }
-next_page:
- if (vmas) {
- vmas[i] = vma;
- page_mask = 0;
- }
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
- if (page_increm > nr_pages)
- page_increm = nr_pages;
- i += page_increm;
- start += page_increm * PAGE_SIZE;
- nr_pages -= page_increm;
- } while (nr_pages);
- return i;
-}
-EXPORT_SYMBOL(__get_user_pages);
-
-/*
- * fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @address: user address
- * @fault_flags:flags to pass down to handle_mm_fault()
- *
- * This is meant to be called in the specific scenario where for locking reasons
- * we try to access user memory in atomic context (within a pagefault_disable()
- * section), this returns -EFAULT, and we want to resolve the user fault before
- * trying again.
- *
- * Typically this is meant to be used by the futex code.
- *
- * The main difference with get_user_pages() is that this function will
- * unconditionally call handle_mm_fault() which will in turn perform all the
- * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
- *
- * This is important for some architectures where those bits also gate the
- * access permission to the page because they are maintained in software. On
- * such architectures, gup() will not be enough to make a subsequent access
- * succeed.
- *
- * This should be called with the mm_sem held for read.
- */
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
-{
- struct vm_area_struct *vma;
- vm_flags_t vm_flags;
- int ret;
-
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
- return -EFAULT;
-
- vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
- if (!(vm_flags & vma->vm_flags))
- return -EFAULT;
-
- ret = handle_mm_fault(mm, vma, address, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return -EHWPOISON;
- if (ret & VM_FAULT_SIGBUS)
- return -EFAULT;
- BUG();
- }
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
- return 0;
-}
-
-/*
- * get_user_pages() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to by the caller
- * @force: whether to force access even when user mapping is currently
- * protected (but never forces write access to shared mapping).
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If write=0, the page must not be written to. If the page is written to,
- * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
- * after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
-{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
-}
-EXPORT_SYMBOL(get_user_pages);
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by page_cache_release() or put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_sem, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33514d88fef..6001ee6347a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -5,8 +5,6 @@
* the COPYING file in the top-level directory.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
@@ -22,19 +20,17 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/migrate.h>
-#include <linux/hashtable.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
/*
- * By default transparent hugepage support is disabled in order that avoid
- * to risk increase the memory footprint of applications without a guaranteed
- * benefit. When transparent hugepage support is enabled, is for all mappings,
- * and khugepaged scans all mappings.
- * Defrag is invoked by khugepaged hugepage allocations and by page faults
- * for all hugepage allocations.
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -66,11 +62,12 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
-#define MM_SLOTS_HASH_BITS 10
-static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
static struct kmem_cache *mm_slot_cache __read_mostly;
/**
@@ -108,6 +105,7 @@ static int set_recommended_min_free_kbytes(void)
struct zone *zone;
int nr_zones = 0;
unsigned long recommended_min;
+ extern int min_free_kbytes;
if (!khugepaged_enabled())
return 0;
@@ -132,14 +130,8 @@ static int set_recommended_min_free_kbytes(void)
(unsigned long) nr_free_buffer_pages() / 20);
recommended_min <<= (PAGE_SHIFT-10);
- if (recommended_min > min_free_kbytes) {
- if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
- min_free_kbytes, recommended_min);
-
+ if (recommended_min > min_free_kbytes)
min_free_kbytes = recommended_min;
- }
setup_per_zone_wmarks();
return 0;
}
@@ -153,7 +145,8 @@ static int start_khugepaged(void)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
if (unlikely(IS_ERR(khugepaged_thread))) {
- pr_err("khugepaged: kthread_run(khugepaged) failed\n");
+ printk(KERN_ERR
+ "khugepaged: kthread_run(khugepaged) failed\n");
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
}
@@ -171,34 +164,35 @@ static int start_khugepaged(void)
}
static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
+static unsigned long huge_zero_pfn __read_mostly;
-static inline bool is_huge_zero_page(struct page *page)
+static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- return ACCESS_ONCE(huge_zero_page) == page;
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
}
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
- return is_huge_zero_page(pmd_page(pmd));
+ return is_huge_zero_pfn(pmd_pfn(pmd));
}
-static struct page *get_huge_zero_page(void)
+static unsigned long get_huge_zero_page(void)
{
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return ACCESS_ONCE(huge_zero_page);
+ return ACCESS_ONCE(huge_zero_pfn);
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_page) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
- return NULL;
+ return 0;
}
count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
- if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
preempt_enable();
__free_page(zero_page);
goto retry;
@@ -207,7 +201,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return ACCESS_ONCE(huge_zero_page);
+ return ACCESS_ONCE(huge_zero_pfn);
}
static void put_huge_zero_page(void)
@@ -219,29 +213,24 @@ static void put_huge_zero_page(void)
BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- /* we can free zero page only if last reference remains */
- return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
-}
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
-{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
- struct page *zero_page = xchg(&huge_zero_page, NULL);
- BUG_ON(zero_page == NULL);
- __free_page(zero_page);
- return HPAGE_PMD_NR;
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
}
return 0;
}
static struct shrinker huge_zero_page_shrinker = {
- .count_objects = shrink_huge_zero_page_count,
- .scan_objects = shrink_huge_zero_page_scan,
+ .shrink = shrink_huge_zero_page,
.seeks = DEFAULT_SEEKS,
};
@@ -430,7 +419,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
+ err = strict_strtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -457,7 +446,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
+ err = strict_strtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -483,7 +472,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
int err;
unsigned long pages;
- err = kstrtoul(buf, 10, &pages);
+ err = strict_strtoul(buf, 10, &pages);
if (err || !pages || pages > UINT_MAX)
return -EINVAL;
@@ -551,7 +540,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
int err;
unsigned long max_ptes_none;
- err = kstrtoul(buf, 10, &max_ptes_none);
+ err = strict_strtoul(buf, 10, &max_ptes_none);
if (err || max_ptes_none > HPAGE_PMD_NR-1)
return -EINVAL;
@@ -585,19 +574,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
- pr_err("failed to create transparent hugepage kobject\n");
+ printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
if (err) {
- pr_err("failed to register transparent hugepage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto delete_obj;
}
err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
if (err) {
- pr_err("failed to register transparent hugepage group\n");
+ printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n");
goto remove_hp_group;
}
@@ -645,6 +634,12 @@ static int __init hugepage_init(void)
if (err)
goto out;
+ err = mm_slots_hash_init();
+ if (err) {
+ khugepaged_slab_free();
+ goto out;
+ }
+
register_shrinker(&huge_zero_page_shrinker);
/*
@@ -662,7 +657,7 @@ out:
hugepage_exit_sysfs(hugepage_kobj);
return err;
}
-subsys_initcall(hugepage_init);
+module_init(hugepage_init)
static int __init setup_transparent_hugepage(char *str)
{
@@ -690,7 +685,8 @@ static int __init setup_transparent_hugepage(char *str)
}
out:
if (!ret)
- pr_warn("transparent_hugepage= cannot parse, ignored\n");
+ printk(KERN_WARNING
+ "transparent_hugepage= cannot parse, ignored\n");
return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
@@ -702,10 +698,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
-static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
+static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
{
pmd_t entry;
- entry = mk_pmd(page, prot);
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
return entry;
}
@@ -716,37 +713,36 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
- spinlock_t *ptl;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
clear_huge_page(page, haddr, HPAGE_PMD_NR);
- /*
- * The memory barrier inside __SetPageUptodate makes sure that
- * clear_huge_page writes become visible before the set_pmd_at()
- * write.
- */
__SetPageUptodate(page);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
} else {
pmd_t entry;
- entry = mk_huge_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = mk_huge_pmd(page, vma);
+ /*
+ * The spinlocking to take the lru_lock inside
+ * page_add_new_anon_rmap() acts as a full memory
+ * barrier to be sure clear_huge_page writes become
+ * visible after the set_pmd_at() write.
+ */
page_add_new_anon_rmap(page, vma, haddr);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&mm->nr_ptes);
- spin_unlock(ptl);
+ mm->nr_ptes++;
+ spin_unlock(&mm->page_table_lock);
}
return 0;
@@ -766,20 +762,27 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-/* Caller must hold page table lock. */
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
- struct page *zero_page)
+ unsigned long zero_pfn)
{
pmd_t entry;
if (!pmd_none(*pmd))
return false;
- entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- atomic_long_inc(&mm->nr_ptes);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
return true;
}
@@ -789,65 +792,83 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return VM_FAULT_FALLBACK;
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma)))
- return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE) &&
- transparent_hugepage_use_zero_page()) {
- spinlock_t *ptl;
- pgtable_t pgtable;
- struct page *zero_page;
- bool set;
- pgtable = pte_alloc_one(mm, haddr);
- if (unlikely(!pgtable))
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
- zero_page = get_huge_zero_page();
- if (unlikely(!zero_page)) {
- pte_free(mm, pgtable);
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ unsigned long zero_pfn;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
+ goto out;
}
- ptl = pmd_lock(mm, pmd);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(ptl);
- if (!set) {
- pte_free(mm, pgtable);
- put_huge_zero_page();
+ count_vm_event(THP_FAULT_ALLOC);
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
+ page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
}
+
return 0;
}
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- if (unlikely(!page)) {
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
- mem_cgroup_uncharge_page(page);
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- return 0;
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
{
- spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable;
@@ -858,9 +879,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
- dst_ptl = pmd_lock(dst_mm, dst_pmd);
- src_ptl = pmd_lockptr(src_mm, src_pmd);
- spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pmd = *src_pmd;
@@ -869,51 +889,50 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * When page table lock is held, the huge zero pmd should not be
+ * mm->page_table_lock is enough to be sure that huge zero pmd is not
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
+ unsigned long zero_pfn;
bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = get_huge_zero_page();
+ zero_pfn = get_huge_zero_page();
set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
+ zero_pfn);
BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
}
-
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
pte_free(dst_mm, pgtable);
wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
goto out;
}
src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ VM_BUG_ON(!PageHead(src_page));
get_page(src_page);
page_dup_rmap(src_page);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- atomic_long_inc(&dst_mm->nr_ptes);
+ pgtable_trans_huge_deposit(dst_mm, pgtable);
+ dst_mm->nr_ptes++;
ret = 0;
out_unlock:
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
out:
return ret;
}
@@ -924,11 +943,10 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{
- spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
@@ -938,38 +956,81 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);
unlock:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
}
-}
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_page;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
}
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
+ inc_mm_counter(mm, MM_ANONPAGES);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+out_free_page:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
}
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
@@ -979,7 +1040,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page *page,
unsigned long haddr)
{
- spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -999,7 +1059,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_charge_anon(pages[i], mm,
+ mem_cgroup_newpage_charge(pages[i], mm,
GFP_KERNEL))) {
if (pages[i])
put_page(pages[i]);
@@ -1026,15 +1086,15 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON(!PageHead(page));
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1052,7 +1112,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1063,7 +1123,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1078,24 +1138,22 @@ out_free_pages:
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
- spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
- ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
- VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -1105,8 +1163,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
- spin_unlock(ptl);
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1116,37 +1174,32 @@ alloc:
new_page = NULL;
if (unlikely(!new_page)) {
- if (!page) {
- split_huge_page_pmd(vma, address, pmd);
- ret |= VM_FAULT_FALLBACK;
+ count_vm_event(THP_FAULT_FALLBACK);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, orig_pmd, haddr);
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
- if (ret & VM_FAULT_OOM) {
+ if (ret & VM_FAULT_OOM)
split_huge_page(page);
- ret |= VM_FAULT_FALLBACK;
- }
- put_user_huge_page(page);
+ put_page(page);
}
- count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
- if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
if (page) {
split_huge_page(page);
- put_user_huge_page(page);
- } else
- split_huge_page_pmd(vma, address, pmd);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
+ put_page(page);
+ }
+ ret |= VM_FAULT_OOM;
goto out;
}
- count_vm_event(THP_FAULT_ALLOC);
-
- if (!page)
+ if (is_huge_zero_pmd(orig_pmd))
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
@@ -1156,39 +1209,38 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
} else {
pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = mk_huge_pmd(new_page, vma);
pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (!page) {
+ if (is_huge_zero_pmd(orig_pmd)) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON(!PageHead(page));
page_remove_rmap(page);
put_page(page);
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -1200,21 +1252,13 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
- assert_spin_locked(pmd_lockptr(mm, pmd));
+ assert_spin_locked(&mm->page_table_lock);
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
- /* Avoid dumping huge zero page */
- if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
- return ERR_PTR(-EFAULT);
-
- /* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
- goto out;
-
page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON(!PageHead(page));
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
@@ -1226,9 +1270,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
- update_mmu_cache_pmd(vma, addr, pmd);
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
@@ -1239,7 +1281,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
get_page_foll(page);
@@ -1251,157 +1293,101 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
- spinlock_t *ptl;
- struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
- int target_nid, last_cpupid = -1;
- bool page_locked;
- bool migrated = false;
- int flags = 0;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
- ptl = pmd_lock(mm, pmdp);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
- /*
- * If there are potential migrations, wait for completion and retry
- * without disrupting NUMA hinting information. Do not relock and
- * check_same as the page may no longer be mapped.
- */
- if (unlikely(pmd_trans_migrating(*pmdp))) {
- spin_unlock(ptl);
- wait_migrate_huge_page(vma->anon_vma, pmdp);
- goto out;
- }
-
page = pmd_page(pmd);
- BUG_ON(is_huge_zero_page(page));
- page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
+ get_page(page);
+ current_nid = page_to_nid(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid) {
+ if (current_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- flags |= TNF_FAULT_LOCAL;
- }
- /*
- * Avoid grouping on DSO/COW pages in specific and RO pages
- * in general, RO pages shouldn't hurt as much anyway since
- * they can be in shared cache state.
- */
- if (!pmd_write(pmd))
- flags |= TNF_NO_GROUP;
-
- /*
- * Acquire the page lock to serialise THP migrations but avoid dropping
- * page_table_lock if at all possible
- */
- page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- /* If the page was locked, there are no parallel migrations */
- if (page_locked)
- goto clear_pmdnuma;
- }
-
- /* Migration could have started since the pmd_trans_migrating check */
- if (!page_locked) {
- spin_unlock(ptl);
- wait_on_page_locked(page);
- page_nid = -1;
- goto out;
+ put_page(page);
+ goto clear_pmdnuma;
}
- /*
- * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
- * to serialises splits
- */
- get_page(page);
- spin_unlock(ptl);
- anon_vma = page_lock_anon_vma_read(page);
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
- /* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(ptl);
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
goto out_unlock;
}
+ spin_unlock(&mm->page_table_lock);
- /* Bail if we fail to protect against THP splits for any reason */
- if (unlikely(!anon_vma)) {
- put_page(page);
- page_nid = -1;
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
goto clear_pmdnuma;
}
- /*
- * Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
- */
- spin_unlock(ptl);
- migrated = migrate_misplaced_transhuge_page(mm, vma,
- pmdp, pmd, addr, page, target_nid);
- if (migrated) {
- flags |= TNF_MIGRATED;
- page_nid = target_nid;
- }
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
- goto out;
clear_pmdnuma:
- BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
- unlock_page(page);
-out_unlock:
- spin_unlock(ptl);
-
-out:
- if (anon_vma)
- page_unlock_anon_vma_read(anon_vma);
-
- if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+ if (page_locked)
+ unlock_page(page);
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
return 0;
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pmdp related
- * operations.
- */
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(!PageHead(page), page);
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1414,15 +1400,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec)
{
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1435,7 +1420,6 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
- spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
@@ -1456,72 +1440,41 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- /*
- * We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
- */
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
+ ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
- new_ptl = pmd_lockptr(mm, new_pmd);
- if (new_ptl != old_ptl)
- spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-
- if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
- pgtable_t pgtable;
- pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
- pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
- }
- set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- if (new_ptl != old_ptl)
- spin_unlock(new_ptl);
- spin_unlock(old_ptl);
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ spin_unlock(&mm->page_table_lock);
}
out:
return ret;
}
-/*
- * Returns
- * - 0 if PMD could not be locked
- * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- * - HPAGE_PMD_NR is protections changed and TLB flush necessary
- */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
- ret = 1;
+ entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
!pmd_numa(*pmd)) {
- pmdp_set_numa(mm, addr, pmd);
- ret = HPAGE_PMD_NR;
+ entry = pmd_mknuma(entry);
}
}
- spin_unlock(ptl);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ ret = 1;
}
return ret;
@@ -1534,13 +1487,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
- spinlock_t **ptl)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
- *ptl = pmd_lock(vma->vm_mm, pmd);
+ spin_lock(&vma->vm_mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1549,44 +1501,27 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
return 1;
}
}
- spin_unlock(*ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
+ enum page_check_address_pmd_flag flag)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
+ pmd_t *pmd, *ret = NULL;
if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
+ goto out;
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+ if (pmd_none(*pmd))
+ goto out;
if (pmd_page(*pmd) != page)
- goto unlock;
+ goto out;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1596,15 +1531,14 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto unlock;
+ goto out;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
- return pmd;
+ ret = pmd;
}
-unlock:
- spin_unlock(*ptl);
- return NULL;
+out:
+ return ret;
}
static int __split_huge_page_splitting(struct page *page,
@@ -1612,7 +1546,6 @@ static int __split_huge_page_splitting(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd;
int ret = 0;
/* For mmu_notifiers */
@@ -1620,8 +1553,9 @@ static int __split_huge_page_splitting(struct page *page,
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1632,15 +1566,14 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
}
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
+static void __split_huge_page_refcount(struct page *page)
{
int i;
struct zone *zone = page_zone(page);
@@ -1693,9 +1626,7 @@ static void __split_huge_page_refcount(struct page *page,
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
+ (1L << PG_uptodate)));
page_tail->flags |= (1L << PG_dirty);
/* clear PageTail before overwriting first_page */
@@ -1721,19 +1652,20 @@ static void __split_huge_page_refcount(struct page *page,
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
- lru_add_page_tail(page, page_tail, lruvec, list);
+ lru_add_page_tail(page, page_tail, lruvec);
}
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
ClearPageCompound(page);
compound_unlock(page);
@@ -1764,16 +1696,16 @@ static int __split_huge_page_map(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd, _pmd;
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
+ spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
haddr = address;
@@ -1826,16 +1758,15 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
return ret;
}
/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
+ struct anon_vma *anon_vma)
{
int mapcount, mapcount2;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1861,13 +1792,12 @@ static void __split_huge_page(struct page *page,
* the newly established pmd of the child later during the
* walk, to be able to set it as pmd_trans_splitting too.
*/
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
- __split_huge_page_refcount(page, list);
+ __split_huge_page_refcount(page);
mapcount2 = 0;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
@@ -1876,26 +1806,18 @@ static void __split_huge_page(struct page *page,
BUG_ON(is_vma_temporary_stack(vma));
mapcount2 += __split_huge_page_map(page, vma, addr);
}
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
}
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
+int split_huge_page(struct page *page)
{
struct anon_vma *anon_vma;
int ret = 1;
- BUG_ON(is_huge_zero_page(page));
+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
/*
@@ -1915,38 +1837,33 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
+ __split_huge_page(page, anon_vma);
count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
out_unlock:
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
put_anon_vma(anon_vma);
out:
return ret;
}
-#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
+ struct mm_struct *mm = vma->vm_mm;
+
switch (advice) {
case MADV_HUGEPAGE:
-#ifdef CONFIG_S390
- /*
- * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
- * can't handle this properly after s390_enable_sie, so we simply
- * ignore the madvise to prevent qemu from causing a SIGSEGV.
- */
- if (mm_has_pgste(vma->vm_mm))
- return 0;
-#endif
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
+ if (mm->def_flags & VM_NOHUGEPAGE)
+ return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
@@ -1987,6 +1904,12 @@ static int __init khugepaged_slab_init(void)
return 0;
}
+static void __init khugepaged_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ mm_slot_cache = NULL;
+}
+
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
@@ -1999,22 +1922,47 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
kmem_cache_free(mm_slot_cache, mm_slot);
}
+static int __init mm_slots_hash_init(void)
+{
+ mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!mm_slots_hash)
+ return -ENOMEM;
+ return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+ mm_slots_hash = NULL;
+}
+#endif
+
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
- hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ hlist_for_each_entry(mm_slot, node, bucket, hash) {
if (mm == mm_slot->mm)
return mm_slot;
-
+ }
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
+ hlist_add_head(&mm_slot->hash, bucket);
}
static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2083,7 +2031,7 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
- hash_del(&mm_slot->hash);
+ hlist_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
free = 1;
}
@@ -2146,9 +2094,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (unlikely(!page))
goto out;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- VM_BUG_ON_PAGE(!PageAnon(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
@@ -2171,8 +2119,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/* 0 stands for page_is_file_cache(page) == false */
inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
@@ -2202,7 +2150,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
@@ -2231,34 +2179,7 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
-static int khugepaged_node_load[MAX_NUMNODES];
-
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
-{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
- int nid, target_node = 0, max_value = 0;
-
- /* find first node with max normal pages hit */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
- target_node = nid;
- }
-
- /* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
- target_node = nid;
- break;
- }
-
- last_khugepaged_target_node = target_node;
- return target_node;
-}
-
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2281,7 +2202,7 @@ static struct page
struct vm_area_struct *vma, unsigned long address,
int node)
{
- VM_BUG_ON_PAGE(*hpage, *hpage);
+ VM_BUG_ON(*hpage);
/*
* Allocate the page while the vma is still valid and under
* the mmap_sem read mode so there is no memory allocation
@@ -2292,8 +2213,9 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2309,17 +2231,6 @@ static struct page
return *hpage;
}
#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2386,7 +2297,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
- spinlock_t *pmd_ptl, *pte_ptl;
+ spinlock_t *ptl;
int isolated;
unsigned long hstart, hend;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2399,7 +2310,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!new_page)
return;
- if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
return;
/*
@@ -2412,8 +2323,6 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
vma = find_vma(mm, address);
- if (!vma)
- goto out;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2423,16 +2332,18 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
+ if (pmd_trans_huge(*pmd))
+ goto out;
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
+ ptl = pte_lockptr(mm, pmd);
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2440,25 +2351,20 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(pmd_ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- spin_lock(pte_ptl);
+ spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
- spin_unlock(pte_ptl);
+ spin_unlock(ptl);
if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(pmd_ptl);
+ spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
- /*
- * We can only use set_pmd_at when establishing
- * hugepmds and never for establishing regular pmds that
- * points to regular pagetables. Use pmd_populate for that
- */
- pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
+ set_pmd_at(mm, address, pmd, _pmd);
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock(vma->anon_vma);
goto out;
}
@@ -2466,15 +2372,14 @@ static void collapse_huge_page(struct mm_struct *mm,
* All pages are isolated and locked so anon_vma rmap
* can't run anymore.
*/
- anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_unlock(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ _pmd = mk_huge_pmd(new_page, vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2483,13 +2388,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();
- spin_lock(pmd_ptl);
+ spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(pmd_ptl);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ spin_unlock(&mm->page_table_lock);
*hpage = NULL;
@@ -2514,15 +2419,16 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
struct page *page;
unsigned long _address;
spinlock_t *ptl;
- int node = NUMA_NO_NODE;
+ int node = -1;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
if (!pmd)
goto out;
+ if (pmd_trans_huge(*pmd))
+ goto out;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2539,14 +2445,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Record which node the original page is from and save this
- * information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
- * hit record.
+ * Chose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
*/
- node = page_to_nid(page);
- khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ if (node == -1)
+ node = page_to_nid(page);
+ VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
@@ -2560,11 +2465,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (ret)
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
- }
out:
return ret;
}
@@ -2577,7 +2480,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
if (khugepaged_test_exit(mm)) {
/* free mm_slot */
- hash_del(&mm_slot->hash);
+ hlist_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
/*
@@ -2769,7 +2672,7 @@ static int khugepaged(void *none)
struct mm_slot *mm_slot;
set_freezable();
- set_user_nice(current, MAX_NICE);
+ set_user_nice(current, 19);
while (!kthread_should_stop()) {
khugepaged_do_scan();
@@ -2796,7 +2699,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -2816,7 +2719,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
- spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2827,37 +2729,29 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
+ VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
put_page(page);
-
- /*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
- */
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ BUG_ON(pmd_trans_huge(*pmd));
}
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
@@ -2873,22 +2767,12 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2024bbd573d..4f3ea0b1e57 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,7 +13,6 @@
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
-#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
@@ -22,8 +21,6 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/page-isolation.h>
-#include <linux/jhash.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -36,6 +33,7 @@
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
@@ -50,18 +48,10 @@ static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
/*
- * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
- * free_huge_pages, and surplus_huge_pages.
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
DEFINE_SPINLOCK(hugetlb_lock);
-/*
- * Serializes faults on the same logical page. This is used to
- * prevent spurious OOMs when the hugepage pool is fully utilized.
- */
-static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
-
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -137,15 +127,22 @@ static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
- return subpool_inode(file_inode(vma->vm_file));
+ return subpool_inode(vma->vm_file->f_dentry->d_inode);
}
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
*
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * must either hold the mmap_sem for write, or the mmap_sem for read and
+ * the hugetlb_instantiation mutex:
+ *
+ * down_write(&mm->mmap_sem);
+ * or
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -153,12 +150,10 @@ struct file_region {
long to;
};
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct list_head *head, long f, long t)
{
- struct list_head *head = &resv->regions;
struct file_region *rg, *nrg, *trg;
- spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -188,18 +183,14 @@ static long region_add(struct resv_map *resv, long f, long t)
}
nrg->from = f;
nrg->to = t;
- spin_unlock(&resv->lock);
return 0;
}
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct list_head *head, long f, long t)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg = NULL;
+ struct file_region *rg, *nrg;
long chg = 0;
-retry:
- spin_lock(&resv->lock);
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -209,21 +200,15 @@ retry:
* Subtle, allocate a new region at the position but make it zero
* size such that we can guarantee to record the reservation. */
if (&rg->link == head || t < rg->from) {
- if (!nrg) {
- spin_unlock(&resv->lock);
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (!nrg)
- return -ENOMEM;
-
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- goto retry;
- }
-
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
list_add(&nrg->link, rg->link.prev);
- chg = t - f;
- goto out_nrg;
+
+ return t - f;
}
/* Round our left edge to the current segment if it encloses us. */
@@ -236,7 +221,7 @@ retry:
if (&rg->link == head)
break;
if (rg->from > t)
- goto out;
+ return chg;
/* We overlap with this area, if it extends further than
* us then we must extend ourselves. Account for its
@@ -247,30 +232,20 @@ retry:
}
chg -= rg->to - rg->from;
}
-
-out:
- spin_unlock(&resv->lock);
- /* We already know we raced and no longer need the new region */
- kfree(nrg);
- return chg;
-out_nrg:
- spin_unlock(&resv->lock);
return chg;
}
-static long region_truncate(struct resv_map *resv, long end)
+static long region_truncate(struct list_head *head, long end)
{
- struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
long chg = 0;
- spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
list_for_each_entry(rg, head, link)
if (end <= rg->to)
break;
if (&rg->link == head)
- goto out;
+ return 0;
/* If we are in the middle of a region then adjust it. */
if (end > rg->from) {
@@ -287,19 +262,14 @@ static long region_truncate(struct resv_map *resv, long end)
list_del(&rg->link);
kfree(rg);
}
-
-out:
- spin_unlock(&resv->lock);
return chg;
}
-static long region_count(struct resv_map *resv, long f, long t)
+static long region_count(struct list_head *head, long f, long t)
{
- struct list_head *head = &resv->regions;
struct file_region *rg;
long chg = 0;
- spin_lock(&resv->lock);
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
long seg_from;
@@ -315,7 +285,6 @@ static long region_count(struct resv_map *resv, long f, long t)
chg += seg_to - seg_from;
}
- spin_unlock(&resv->lock);
return chg;
}
@@ -350,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
hstate = hstate_vma(vma);
- return 1UL << huge_page_shift(hstate);
+ return 1UL << (hstate->order + PAGE_SHIFT);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
@@ -406,46 +375,39 @@ static void set_vma_private_data(struct vm_area_struct *vma,
vma->vm_private_data = (void *)value;
}
-struct resv_map *resv_map_alloc(void)
+struct resv_map {
+ struct kref refs;
+ struct list_head regions;
+};
+
+static struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
if (!resv_map)
return NULL;
kref_init(&resv_map->refs);
- spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
return resv_map;
}
-void resv_map_release(struct kref *ref)
+static void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
/* Clear out any active regions before we release the map. */
- region_truncate(resv_map, 0);
+ region_truncate(&resv_map->regions, 0);
kfree(resv_map);
}
-static inline struct resv_map *inode_resv_map(struct inode *inode)
-{
- return inode->i_mapping->private_data;
-}
-
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (vma->vm_flags & VM_MAYSHARE) {
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
-
- return inode_resv_map(inode);
-
- } else {
+ if (!(vma->vm_flags & VM_MAYSHARE))
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
- }
+ return NULL;
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -472,6 +434,25 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct hstate *h,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_NORESERVE)
+ return;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /* Shared mappings always use reserves */
+ h->resv_huge_pages--;
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
+ h->resv_huge_pages--;
+ }
+}
+
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
@@ -481,38 +462,49 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
}
/* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+static int vma_has_reserves(struct vm_area_struct *vma)
{
- if (vma->vm_flags & VM_NORESERVE) {
- /*
- * This address is already reserved by other process(chg == 0),
- * so, we should decrement reserved count. Without decrementing,
- * reserve count remains after releasing inode, because this
- * allocated page will go into page cache and is regarded as
- * coming from reserved pool in releasing step. Currently, we
- * don't have any other solution to deal with this situation
- * properly, so add work-around here.
- */
- if (vma->vm_flags & VM_MAYSHARE && chg == 0)
- return 1;
- else
- return 0;
- }
-
- /* Shared mappings always use reserves */
if (vma->vm_flags & VM_MAYSHARE)
return 1;
-
- /*
- * Only the process that called mmap() has reserves for
- * private mappings.
- */
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 1;
-
return 0;
}
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
@@ -525,15 +517,9 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
- if (!is_migrate_isolate_page(page))
- break;
- /*
- * if 'non-isolated free hugepage' not found on the list,
- * the allocation fails.
- */
- if (&h->hugepage_freelists[nid] == &page->lru)
+ if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
@@ -541,19 +527,9 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
return page;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepages_treat_as_movable || hugepage_migration_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
- unsigned long address, int avoid_reserve,
- long chg)
+ unsigned long address, int avoid_reserve)
{
struct page *page = NULL;
struct mempolicy *mpol;
@@ -563,12 +539,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct zoneref *z;
unsigned int cpuset_mems_cookie;
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask, &mpol, &nodemask);
/*
* A child process with MAP_PRIVATE mappings created by their parent
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma, chg) &&
+ if (!vma_has_reserves(vma) &&
h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
@@ -576,273 +556,47 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = huge_zonelist(vma, address,
- htlb_alloc_mask(h), &mpol, &nodemask);
-
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
page = dequeue_huge_page_node(h, zone_to_nid(zone));
if (page) {
- if (avoid_reserve)
- break;
- if (!vma_has_reserves(vma, chg))
- break;
-
- SetPagePrivate(page);
- h->resv_huge_pages--;
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
break;
}
}
}
mpol_cond_put(mpol);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
err:
+ mpol_cond_put(mpol);
return NULL;
}
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed. Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
- VM_BUG_ON(nid >= MAX_NUMNODES);
-
- return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- if (!node_isset(nid, *nodes_allowed))
- nid = next_node_allowed(nid, nodes_allowed);
- return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
- h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"] from which to free a huge page. Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
- h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
- nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_free(hs, mask)) || 1); \
- nr_nodes--)
-
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
-static void destroy_compound_gigantic_page(struct page *page,
- unsigned long order)
-{
- int i;
- int nr_pages = 1 << order;
- struct page *p = page + 1;
-
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
- __ClearPageTail(p);
- set_page_refcounted(p);
- p->first_page = NULL;
- }
-
- set_compound_order(page, 0);
- __ClearPageHead(page);
-}
-
-static void free_gigantic_page(struct page *page, unsigned order)
-{
- free_contig_range(page_to_pfn(page), 1 << order);
-}
-
-static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages)
-{
- unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-}
-
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
- unsigned long nr_pages)
-{
- unsigned long i, end_pfn = start_pfn + nr_pages;
- struct page *page;
-
- for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
- return false;
-
- page = pfn_to_page(i);
-
- if (PageReserved(page))
- return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
- }
-
- return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long last_pfn = start_pfn + nr_pages - 1;
- return zone_spans_pfn(zone, last_pfn);
-}
-
-static struct page *alloc_gigantic_page(int nid, unsigned order)
-{
- unsigned long nr_pages = 1 << order;
- unsigned long ret, pfn, flags;
- struct zone *z;
-
- z = NODE_DATA(nid)->node_zones;
- for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
- spin_lock_irqsave(&z->lock, flags);
-
- pfn = ALIGN(z->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(pfn, nr_pages)) {
- /*
- * We release the zone lock here because
- * alloc_contig_range() will also lock the zone
- * at some point. If there's an allocation
- * spinning on this lock, it may win the race
- * and cause alloc_contig_range() to fail...
- */
- spin_unlock_irqrestore(&z->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages);
- if (!ret)
- return pfn_to_page(pfn);
- spin_lock_irqsave(&z->lock, flags);
- }
- pfn += nr_pages;
- }
-
- spin_unlock_irqrestore(&z->lock, flags);
- }
-
- return NULL;
-}
-
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
-
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
- struct page *page;
-
- page = alloc_gigantic_page(nid, huge_page_order(h));
- if (page) {
- prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, nid);
- }
-
- return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- struct page *page = NULL;
- int nr_nodes, node;
-
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_gigantic_page_node(h, node);
- if (page)
- return 1;
- }
-
- return 0;
-}
-
-static inline bool gigantic_page_supported(void) { return true; }
-#else
-static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
-static inline void destroy_compound_gigantic_page(struct page *page,
- unsigned long order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed) { return 0; }
-#endif
-
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return;
+ VM_BUG_ON(h->order >= MAX_ORDER);
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_private |
- 1 << PG_writeback);
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
- if (hstate_is_gigantic(h)) {
- destroy_compound_gigantic_page(page, huge_page_order(h));
- free_gigantic_page(page, huge_page_order(h));
- } else {
- arch_release_hugepage(page);
- __free_pages(page, huge_page_order(h));
- }
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
}
struct hstate *size_to_hstate(unsigned long size)
@@ -866,22 +620,16 @@ static void free_huge_page(struct page *page)
int nid = page_to_nid(page);
struct hugepage_subpool *spool =
(struct hugepage_subpool *)page_private(page);
- bool restore_reserve;
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- restore_reserve = PagePrivate(page);
- ClearPagePrivate(page);
spin_lock(&hugetlb_lock);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
- if (restore_reserve)
- h->resv_huge_pages++;
-
- if (h->surplus_huge_pages_node[nid]) {
+ if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -916,22 +664,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__SetPageHead(page);
- __ClearPageReserved(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
- /*
- * For gigantic hugepages allocated through bootmem at
- * boot, it's safer to be consistent with the not-gigantic
- * hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwse drivers using get_user_pages() to access tail
- * pages may get the reference counting wrong if they see
- * PG_reserved set on a tail page (despite the head page not
- * having PG_reserved set). Enforcing this consistency between
- * head and tail pages allows drivers to optimize away a check
- * on the head page when they need know if put_page() is needed
- * after get_user_pages().
- */
- __ClearPageReserved(p);
set_page_count(p, 0);
p->first_page = page;
}
@@ -944,49 +678,27 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
*/
int PageHuge(struct page *page)
{
+ compound_page_dtor *dtor;
+
if (!PageCompound(page))
return 0;
page = compound_head(page);
- return get_compound_page_dtor(page) == free_huge_page;
-}
-EXPORT_SYMBOL_GPL(PageHuge);
-
-/*
- * PageHeadHuge() only returns true for hugetlbfs head page, but not for
- * normal or transparent huge pages.
- */
-int PageHeadHuge(struct page *page_head)
-{
- if (!PageHead(page_head))
- return 0;
+ dtor = get_compound_page_dtor(page);
- return get_compound_page_dtor(page_head) == free_huge_page;
-}
-
-pgoff_t __basepage_index(struct page *page)
-{
- struct page *page_head = compound_head(page);
- pgoff_t index = page_index(page_head);
- unsigned long compound_idx;
-
- if (!PageHuge(page_head))
- return page_index(page);
-
- if (compound_order(page_head) >= MAX_ORDER)
- compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
- else
- compound_idx = page - page_head;
-
- return (index << compound_order(page_head)) + compound_idx;
+ return dtor == free_huge_page;
}
+EXPORT_SYMBOL_GPL(PageHuge);
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
+ if (h->order >= MAX_ORDER)
+ return NULL;
+
page = alloc_pages_exact_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
@@ -1000,19 +712,67 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
return page;
}
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
- int nr_nodes, node;
+ int start_nid;
+ int next_nid;
int ret = 0;
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page_node(h, node);
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
+ page = alloc_fresh_huge_page_node(h, next_nid);
if (page) {
ret = 1;
break;
}
- }
+ next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ } while (next_nid != start_nid);
if (ret)
count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -1023,6 +783,24 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
}
/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
* Free huge page from pool from next node to free.
* Attempt to keep persistent huge pages more or less
* balanced over allowed nodes.
@@ -1031,79 +809,46 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
bool acct_surplus)
{
- int nr_nodes, node;
+ int start_nid;
+ int next_nid;
int ret = 0;
- for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
/*
* If we're returning unused surplus pages, only examine
* nodes with surplus pages.
*/
- if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
- !list_empty(&h->hugepage_freelists[node])) {
+ if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+ !list_empty(&h->hugepage_freelists[next_nid])) {
struct page *page =
- list_entry(h->hugepage_freelists[node].next,
+ list_entry(h->hugepage_freelists[next_nid].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
- h->free_huge_pages_node[node]--;
+ h->free_huge_pages_node[next_nid]--;
if (acct_surplus) {
h->surplus_huge_pages--;
- h->surplus_huge_pages_node[node]--;
+ h->surplus_huge_pages_node[next_nid]--;
}
update_and_free_page(h, page);
ret = 1;
break;
}
- }
+ next_nid = hstate_next_node_to_free(h, nodes_allowed);
+ } while (next_nid != start_nid);
return ret;
}
-/*
- * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
- */
-static void dissolve_free_huge_page(struct page *page)
-{
- spin_lock(&hugetlb_lock);
- if (PageHuge(page) && !page_count(page)) {
- struct hstate *h = page_hstate(page);
- int nid = page_to_nid(page);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- update_and_free_page(h, page);
- }
- spin_unlock(&hugetlb_lock);
-}
-
-/*
- * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
- * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
- */
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned int order = 8 * sizeof(void *);
- unsigned long pfn;
- struct hstate *h;
-
- /* Set scan step to minimum hugepage size */
- for_each_hstate(h)
- if (order > huge_page_order(h))
- order = huge_page_order(h);
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
- dissolve_free_huge_page(pfn_to_page(pfn));
-}
-
static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
unsigned int r_nid;
- if (hstate_is_gigantic(h))
+ if (h->order >= MAX_ORDER)
return NULL;
/*
@@ -1140,12 +885,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (nid == NUMA_NO_NODE)
- page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
else
page = alloc_pages_exact_node(nid,
- htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
@@ -1182,11 +927,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
- struct page *page = NULL;
+ struct page *page;
spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_node(h, nid);
+ page = dequeue_huge_page_node(h, nid);
spin_unlock(&hugetlb_lock);
if (!page)
@@ -1267,15 +1011,18 @@ retry:
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
free:
spin_unlock(&hugetlb_lock);
/* Free unnecessary surplus pages to the buddy allocator */
- list_for_each_entry_safe(page, tmp, &surplus_list, lru)
- put_page(page);
+ if (!list_empty(&surplus_list)) {
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ put_page(page);
+ }
+ }
spin_lock(&hugetlb_lock);
return ret;
@@ -1296,7 +1043,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
- if (hstate_is_gigantic(h))
+ if (h->order >= MAX_ORDER)
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1312,7 +1059,6 @@ static void return_unused_surplus_pages(struct hstate *h,
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
- cond_resched_lock(&hugetlb_lock);
}
}
@@ -1329,34 +1075,45 @@ static void return_unused_surplus_pages(struct hstate *h,
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct resv_map *resv;
- pgoff_t idx;
- long chg;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ return region_chg(&inode->i_mapping->private_list,
+ idx, idx + 1);
- resv = vma_resv_map(vma);
- if (!resv)
+ } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
return 1;
- idx = vma_hugecache_offset(h, vma, addr);
- chg = region_chg(resv, idx, idx + 1);
+ } else {
+ long err;
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
- if (vma->vm_flags & VM_MAYSHARE)
- return chg;
- else
- return chg < 0 ? chg : 0;
+ err = region_chg(&reservations->regions, idx, idx + 1);
+ if (err < 0)
+ return err;
+ return 0;
+ }
}
static void vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct resv_map *resv;
- pgoff_t idx;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
- resv = vma_resv_map(vma);
- if (!resv)
- return;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ region_add(&inode->i_mapping->private_list, idx, idx + 1);
- idx = vma_hugecache_offset(h, vma, addr);
- region_add(resv, idx, idx + 1);
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ pgoff_t idx = vma_hugecache_offset(h, vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /* Mark this page used in the map. */
+ region_add(&reservations->regions, idx, idx + 1);
+ }
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1381,67 +1138,58 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1))
+ if (chg)
+ if (hugepage_subpool_get_pages(spool, chg))
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
- if (ret)
- goto out_subpool_put;
-
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
- if (!page) {
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page)
- goto out_uncharge_cgroup;
-
+ if (!page) {
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
list_move(&page->lru, &h->hugepage_activelist);
- /* Fall through */
+ spin_unlock(&hugetlb_lock);
}
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
- spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
return page;
-
-out_uncharge_cgroup:
- hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
-out_subpool_put:
- if (chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
- return ERR_PTR(-ENOSPC);
-}
-
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
- unsigned long addr, int avoid_reserve)
-{
- struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
- if (IS_ERR(page))
- page = NULL;
- return page;
}
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes, node;
+ int nr_nodes = nodes_weight(node_states[N_MEMORY]);
- for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
+ while (nr_nodes) {
void *addr;
- addr = memblock_virt_alloc_try_nid_nopanic(
- huge_page_size(h), huge_page_size(h),
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+ addr = __alloc_bootmem_node_nopanic(
+ NODE_DATA(hstate_next_node_to_alloc(h,
+ &node_states[N_MEMORY])),
+ huge_page_size(h), huge_page_size(h), 0);
+
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1451,6 +1199,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
m = addr;
goto found;
}
+ nr_nodes--;
}
return 0;
@@ -1462,7 +1211,7 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void prep_compound_huge_page(struct page *page, int order)
{
if (unlikely(order > (MAX_ORDER - 1)))
prep_compound_gigantic_page(page, order);
@@ -1481,14 +1230,14 @@ static void __init gather_bootmem_prealloc(void)
#ifdef CONFIG_HIGHMEM
page = pfn_to_page(m->phys >> PAGE_SHIFT);
- memblock_free_late(__pa(m),
- sizeof(struct huge_bootmem_page));
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
#else
page = virt_to_page(m);
#endif
+ __ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
- WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
/*
* If we had gigantic hugepages allocated at boot time, we need
@@ -1496,8 +1245,8 @@ static void __init gather_bootmem_prealloc(void)
* fix confusing memory reports from free(1) and another
* side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, 1 << h->order);
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
}
}
@@ -1506,7 +1255,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
unsigned long i;
for (i = 0; i < h->max_huge_pages; ++i) {
- if (hstate_is_gigantic(h)) {
+ if (h->order >= MAX_ORDER) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1522,7 +1271,7 @@ static void __init hugetlb_init_hstates(void)
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
- if (!hstate_is_gigantic(h))
+ if (h->order < MAX_ORDER)
hugetlb_hstate_alloc_pages(h);
}
}
@@ -1544,7 +1293,8 @@ static void __init report_hugepages(void)
for_each_hstate(h) {
char buf[32];
- pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ printk(KERN_INFO "HugeTLB registered %s page size, "
+ "pre-allocated %ld pages\n",
memfmt(buf, huge_page_size(h)),
h->free_huge_pages);
}
@@ -1556,7 +1306,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
{
int i;
- if (hstate_is_gigantic(h))
+ if (h->order >= MAX_ORDER)
return;
for_each_node_mask(i, *nodes_allowed) {
@@ -1589,28 +1339,48 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
int delta)
{
- int nr_nodes, node;
+ int start_nid, next_nid;
+ int ret = 0;
VM_BUG_ON(delta != -1 && delta != 1);
- if (delta < 0) {
- for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- if (h->surplus_huge_pages_node[node])
- goto found;
+ if (delta < 0)
+ start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+ else
+ start_nid = hstate_next_node_to_free(h, nodes_allowed);
+ next_nid = start_nid;
+
+ do {
+ int nid = next_nid;
+ if (delta < 0) {
+ /*
+ * To shrink on this node, there must be a surplus page
+ */
+ if (!h->surplus_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_alloc(h,
+ nodes_allowed);
+ continue;
+ }
}
- } else {
- for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
- if (h->surplus_huge_pages_node[node] <
- h->nr_huge_pages_node[node])
- goto found;
+ if (delta > 0) {
+ /*
+ * Surplus cannot exceed the total number of pages
+ */
+ if (h->surplus_huge_pages_node[nid] >=
+ h->nr_huge_pages_node[nid]) {
+ next_nid = hstate_next_node_to_free(h,
+ nodes_allowed);
+ continue;
+ }
}
- }
- return 0;
-found:
- h->surplus_huge_pages += delta;
- h->surplus_huge_pages_node[node] += delta;
- return 1;
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[nid] += delta;
+ ret = 1;
+ break;
+ } while (next_nid != start_nid);
+
+ return ret;
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1619,7 +1389,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (h->order >= MAX_ORDER)
return h->max_huge_pages;
/*
@@ -1646,10 +1416,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed);
- else
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -1680,7 +1447,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
- cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -1744,12 +1510,12 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct hstate *h;
NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
- err = kstrtoul(buf, 10, &count);
+ err = strict_strtoul(buf, 10, &count);
if (err)
goto out;
h = kobj_to_hstate(kobj, &nid);
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
+ if (h->order >= MAX_ORDER) {
err = -EINVAL;
goto out;
}
@@ -1832,10 +1598,10 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (hstate_is_gigantic(h))
+ if (h->order >= MAX_ORDER)
return -EINVAL;
- err = kstrtoul(buf, 10, &input);
+ err = strict_strtoul(buf, 10, &input);
if (err)
return err;
@@ -1936,7 +1702,8 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
+ h->name);
}
}
@@ -1996,7 +1763,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -2040,7 +1807,7 @@ static void hugetlb_unregister_all_nodes(void)
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -2059,8 +1826,9 @@ static void hugetlb_register_node(struct node *node)
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
- h->name, node->dev.id);
+ printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
+ " for node %d\n",
+ h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
}
@@ -2116,15 +1884,16 @@ static void __exit hugetlb_exit(void)
}
kobject_put(hugepages_kobj);
- kfree(htlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
static int __init hugetlb_init(void)
{
- int i;
-
- if (!hugepages_supported())
+ /* Some platform decide whether they support huge pages at boot
+ * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+ * there is no such support
+ */
+ if (HPAGE_SHIFT == 0)
return 0;
if (!size_to_hstate(default_hstate_size)) {
@@ -2144,17 +1913,6 @@ static int __init hugetlb_init(void)
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
-#ifdef CONFIG_SMP
- num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
-#else
- num_fault_mutexes = 1;
-#endif
- htlb_fault_mutex_table =
- kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
- BUG_ON(!htlb_fault_mutex_table);
-
- for (i = 0; i < num_fault_mutexes; i++)
- mutex_init(&htlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -2166,7 +1924,7 @@ void __init hugetlb_add_hstate(unsigned order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warning("hugepagesz= specified twice, ignoring\n");
+ printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -2202,8 +1960,8 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warning("hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ printk(KERN_WARNING "hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
return 1;
}
@@ -2251,12 +2009,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
- if (!hugepages_supported())
- return -ENOTSUPP;
-
tmp = h->max_huge_pages;
- if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (write && h->order >= MAX_ORDER)
return -EINVAL;
table->data = &tmp;
@@ -2299,6 +2054,18 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
}
#endif /* CONFIG_NUMA */
+int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, buffer, length, ppos);
+ if (hugepages_treat_as_movable)
+ htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
+ else
+ htlb_alloc_mask = GFP_HIGHUSER;
+ return 0;
+}
+
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
@@ -2307,12 +2074,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
- if (!hugepages_supported())
- return -ENOTSUPP;
-
tmp = h->nr_overcommit_huge_pages;
- if (write && hstate_is_gigantic(h))
+ if (write && h->order >= MAX_ORDER)
return -EINVAL;
table->data = &tmp;
@@ -2335,8 +2099,6 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
- if (!hugepages_supported())
- return;
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
@@ -2353,8 +2115,6 @@ void hugetlb_report_meminfo(struct seq_file *m)
int hugetlb_report_node_meminfo(int nid, char *buf)
{
struct hstate *h = &default_hstate;
- if (!hugepages_supported())
- return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
@@ -2364,33 +2124,11 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, h->surplus_huge_pages_node[nid]);
}
-void hugetlb_show_meminfo(void)
-{
- struct hstate *h;
- int nid;
-
- if (!hugepages_supported())
- return;
-
- for_each_node_state(nid, N_MEMORY)
- for_each_hstate(h)
- pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
- nid,
- h->nr_huge_pages_node[nid],
- h->free_huge_pages_node[nid],
- h->surplus_huge_pages_node[nid],
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
-}
-
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
- struct hstate *h;
- unsigned long nr_total_pages = 0;
-
- for_each_hstate(h)
- nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
- return nr_total_pages;
+ struct hstate *h = &default_hstate;
+ return h->nr_huge_pages * pages_per_huge_page(h);
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2436,7 +2174,7 @@ out:
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *reservations = vma_resv_map(vma);
/*
* This new VMA should share its siblings reservation map if present.
@@ -2446,30 +2184,41 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- kref_get(&resv->refs);
+ if (reservations)
+ kref_get(&reservations->refs);
+}
+
+static void resv_map_put(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (!reservations)
+ return;
+ kref_put(&reservations->refs, resv_map_release);
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *reservations = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
- unsigned long reserve, start, end;
-
- if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return;
+ unsigned long reserve;
+ unsigned long start;
+ unsigned long end;
- start = vma_hugecache_offset(h, vma, vma->vm_start);
- end = vma_hugecache_offset(h, vma, vma->vm_end);
+ if (reservations) {
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
- reserve = (end - start) - region_count(resv, start, end);
+ reserve = (end - start) -
+ region_count(&reservations->regions, start, end);
- kref_put(&resv->refs, resv_map_release);
+ resv_map_put(vma);
- if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
+ if (reserve) {
+ hugetlb_acct_memory(h, -reserve);
+ hugepage_subpool_put_pages(spool, reserve);
+ }
}
}
@@ -2497,11 +2246,10 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
pte_t entry;
if (writable) {
- entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
- entry = huge_pte_wrprotect(mk_huge_pte(page,
- vma->vm_page_prot));
+ entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
@@ -2515,36 +2263,11 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
+ entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
}
-static int is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
- else
- return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
- else
- return 0;
-}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -2555,68 +2278,65 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- mmun_start = vma->vm_start;
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
-
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
- spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte) {
- ret = -ENOMEM;
- break;
- }
+ if (!dst_pte)
+ goto nomem;
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
continue;
- dst_ptl = huge_pte_lock(h, dst, dst_pte);
- src_ptl = huge_pte_lockptr(h, src, src_pte);
- spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- entry = huge_ptep_get(src_pte);
- if (huge_pte_none(entry)) { /* skip none entry */
- ;
- } else if (unlikely(is_hugetlb_entry_migration(entry) ||
- is_hugetlb_entry_hwpoisoned(entry))) {
- swp_entry_t swp_entry = pte_to_swp_entry(entry);
-
- if (is_write_migration_entry(swp_entry) && cow) {
- /*
- * COW mappings require pages in both
- * parent and child to be set to read.
- */
- make_migration_entry_read(&swp_entry);
- entry = swp_entry_to_pte(swp_entry);
- set_huge_pte_at(src, addr, src_pte, entry);
- }
- set_huge_pte_at(dst, addr, dst_pte, entry);
- } else {
+ spin_lock(&dst->page_table_lock);
+ spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
+ if (!huge_pte_none(huge_ptep_get(src_pte))) {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
+ return 0;
- if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+nomem:
+ return -ENOMEM;
+}
- return ret;
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
}
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2628,7 +2348,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address;
pte_t *ptep;
pte_t pte;
- spinlock_t *ptl;
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -2642,25 +2361,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
again:
+ spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep))
- goto unlock;
+ continue;
pte = huge_ptep_get(ptep);
if (huge_pte_none(pte))
- goto unlock;
+ continue;
/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- huge_pte_clear(mm, address, ptep);
- goto unlock;
+ pte_clear(mm, address, ptep);
+ continue;
}
page = pte_page(pte);
@@ -2671,7 +2390,7 @@ again:
*/
if (ref_page) {
if (page != ref_page)
- goto unlock;
+ continue;
/*
* Mark the VMA as having unmapped its page so that
@@ -2683,23 +2402,18 @@ again:
pte = huge_ptep_get_and_clear(mm, address, ptep);
tlb_remove_tlb_entry(tlb, ptep, address);
- if (huge_pte_dirty(pte))
+ if (pte_dirty(pte))
set_page_dirty(page);
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush) {
- spin_unlock(ptl);
+ if (force_flush)
break;
- }
/* Bail out after unmapping reference page if supplied */
- if (ref_page) {
- spin_unlock(ptl);
+ if (ref_page)
break;
- }
-unlock:
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
/*
* mmu_gather ran out of room to batch pages, we break out of
* the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2742,7 +2456,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
mm = vma->vm_mm;
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, 0);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2768,7 +2482,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
address = address & huge_page_mask(h);
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
- mapping = file_inode(vma->vm_file)->i_mapping;
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
* Take the mapping lock for the duration of the table walk. As
@@ -2805,10 +2519,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ struct page *pagecache_page)
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
+ int avoidcopy;
int outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -2818,8 +2533,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- page_move_anon_rmap(old_page, vma, address);
+ avoidcopy = (page_mapcount(old_page) == 1);
+ if (avoidcopy) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2833,14 +2550,15 @@ retry_avoidcopy:
* at the time of fork() could consume its reserves on COW instead
* of the full address range.
*/
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ if (!(vma->vm_flags & VM_MAYSHARE) &&
+ is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
old_page != pagecache_page)
outside_reserve = 1;
page_cache_get(old_page);
- /* Drop page table lock as buddy allocator may be called */
- spin_unlock(ptl);
+ /* Drop page_table_lock as buddy allocator may be called */
+ spin_unlock(&mm->page_table_lock);
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
@@ -2858,14 +2576,13 @@ retry_avoidcopy:
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
BUG_ON(huge_pte_none(pte));
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(ptep &&
- pte_same(huge_ptep_get(ptep), pte)))
+ if (likely(pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
- * race occurs while re-acquiring page table
- * lock, and our job is done.
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
*/
return 0;
}
@@ -2873,7 +2590,7 @@ retry_avoidcopy:
}
/* Caller expects lock to be held */
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (err == -ENOMEM)
return VM_FAULT_OOM;
else
@@ -2888,7 +2605,7 @@ retry_avoidcopy:
page_cache_release(new_page);
page_cache_release(old_page);
/* Caller expects lock to be held */
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
}
@@ -2900,14 +2617,12 @@ retry_avoidcopy:
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
- * Retake the page table lock to check for racing updates
+ * Retake the page_table_lock to check for racing updates
* before the page tables are altered
*/
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearPagePrivate(new_page);
-
+ if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
@@ -2917,13 +2632,12 @@ retry_avoidcopy:
/* Make the old page be freed below */
new_page = old_page;
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
page_cache_release(new_page);
page_cache_release(old_page);
-
- /* Caller expects lock to be held */
- spin_lock(ptl);
return 0;
}
@@ -2961,16 +2675,16 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
}
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
+ pgoff_t idx;
unsigned long size;
struct page *page;
+ struct address_space *mapping;
pte_t new_pte;
- spinlock_t *ptl;
/*
* Currently, we are forced to kill the process in the event the
@@ -2978,11 +2692,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
* COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
- pr_warning("PID %d killed due to inadequate hugepage pool\n",
- current->pid);
+ printk(KERN_WARNING
+ "PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
return ret;
}
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
/*
* Use page lock to guard against racing truncation
* before we get page_table_lock.
@@ -3016,7 +2734,6 @@ retry:
goto retry;
goto out;
}
- ClearPagePrivate(page);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
@@ -3054,8 +2771,7 @@ retry:
goto backout_unlocked;
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -3064,10 +2780,9 @@ retry:
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
- if (anon_rmap) {
- ClearPagePrivate(page);
+ if (anon_rmap)
hugepage_add_new_anon_rmap(page, vma, address);
- } else
+ else
page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -3075,69 +2790,32 @@ retry:
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
unlock_page(page);
out:
return ret;
backout:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
backout_unlocked:
unlock_page(page);
put_page(page);
goto out;
}
-#ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
- pgoff_t idx, unsigned long address)
-{
- unsigned long key[2];
- u32 hash;
-
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
-
- hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
-
- return hash & (num_fault_mutexes - 1);
-}
-#else
-/*
- * For uniprocesor systems we always use a single mutex, so just
- * return 0 and avoid the hashing overhead.
- */
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
- pgoff_t idx, unsigned long address)
-{
- return 0;
-}
-#endif
-
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep, entry;
- spinlock_t *ptl;
+ pte_t *ptep;
+ pte_t entry;
int ret;
- u32 hash;
- pgoff_t idx;
struct page *page = NULL;
struct page *pagecache_page = NULL;
+ static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
- struct address_space *mapping;
address &= huge_page_mask(h);
@@ -3145,7 +2823,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -3156,20 +2834,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!ptep)
return VM_FAULT_OOM;
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&htlb_fault_mutex_table[hash]);
-
+ mutex_lock(&hugetlb_instantiation_mutex);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+ ret = hugetlb_no_page(mm, vma, address, ptep, flags);
goto out_mutex;
}
@@ -3183,7 +2856,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* page now as it is used to determine if a reservation has been
* consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -3206,28 +2879,27 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (page != pagecache_page)
lock_page(page);
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
- goto out_ptl;
+ goto out_page_table_lock;
if (flags & FAULT_FLAG_WRITE) {
- if (!huge_pte_write(entry)) {
+ if (!pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
- goto out_ptl;
+ pagecache_page);
+ goto out_page_table_lock;
}
- entry = huge_pte_mkdirty(entry);
+ entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, address, ptep);
-out_ptl:
- spin_unlock(ptl);
+out_page_table_lock:
+ spin_unlock(&mm->page_table_lock);
if (pagecache_page) {
unlock_page(pagecache_page);
@@ -3238,23 +2910,33 @@ out_ptl:
put_page(page);
out_mutex:
- mutex_unlock(&htlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_instantiation_mutex);
+
return ret;
}
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags)
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i,
+ unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
- unsigned long remainder = *nr_pages;
+ int remainder = *length;
struct hstate *h = hstate_vma(vma);
+ spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
- spinlock_t *ptl = NULL;
int absent;
struct page *page;
@@ -3262,12 +2944,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
/*
@@ -3279,31 +2957,18 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (absent && (flags & FOLL_DUMP) &&
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
remainder = 0;
break;
}
- /*
- * We need call hugetlb_fault for both hugepages under migration
- * (in which case hugetlb_fault waits for the migration,) and
- * hwpoisoned hugepages (in which case we need to prevent the
- * caller from accessing to them.) In order to do this, we use
- * here is_swap_pte instead of is_hugetlb_entry_migration and
- * is_hugetlb_entry_hwpoisoned. This is because it simply covers
- * both cases, and because we can't follow correct pages
- * directly from any kind of swap entries.
- */
- if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) &&
- !huge_pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
- if (pte)
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
ret = hugetlb_fault(mm, vma, vaddr,
(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
+ spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;
@@ -3316,7 +2981,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
@@ -3334,9 +2999,9 @@ same_page:
*/
goto same_page;
}
- spin_unlock(ptl);
}
- *nr_pages = remainder;
+ spin_unlock(&mm->page_table_lock);
+ *length = remainder;
*position = vaddr;
return i ? i : -EFAULT;
@@ -3355,28 +3020,24 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
- spinlock_t *ptl;
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
- spin_unlock(ptl);
continue;
}
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
pages++;
}
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
/*
* Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
@@ -3385,7 +3046,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
- mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
}
@@ -3398,7 +3058,6 @@ int hugetlb_reserve_pages(struct inode *inode,
long ret, chg;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
- struct resv_map *resv_map;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3414,13 +3073,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE) {
- resv_map = inode_resv_map(inode);
-
- chg = region_chg(resv_map, from, to);
-
- } else {
- resv_map = resv_map_alloc();
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ chg = region_chg(&inode->i_mapping->private_list, from, to);
+ else {
+ struct resv_map *resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
@@ -3463,23 +3119,20 @@ int hugetlb_reserve_pages(struct inode *inode,
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_add(resv_map, from, to);
+ region_add(&inode->i_mapping->private_list, from, to);
return 0;
out_err:
- if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- kref_put(&resv_map->refs, resv_map_release);
+ if (vma)
+ resv_map_put(vma);
return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
- struct resv_map *resv_map = inode_resv_map(inode);
- long chg = 0;
+ long chg = region_truncate(&inode->i_mapping->private_list, offset);
struct hugepage_subpool *spool = subpool_inode(inode);
- if (resv_map)
- chg = region_truncate(resv_map, offset);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3488,218 +3141,6 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-static unsigned long page_table_shareable(struct vm_area_struct *svma,
- struct vm_area_struct *vma,
- unsigned long addr, pgoff_t idx)
-{
- unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
- svma->vm_start;
- unsigned long sbase = saddr & PUD_MASK;
- unsigned long s_end = sbase + PUD_SIZE;
-
- /* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
-
- /*
- * match the virtual addresses, permission and the alignment of the
- * page table page.
- */
- if (pmd_index(addr) != pmd_index(saddr) ||
- vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
- return 0;
-
- return saddr;
-}
-
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE &&
- vma->vm_start <= base && end <= vma->vm_end)
- return 1;
- return 0;
-}
-
-/*
- * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
- * and returns the corresponding pte. While this is not necessary for the
- * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
- */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
- struct vm_area_struct *vma = find_vma(mm, addr);
- struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- struct vm_area_struct *svma;
- unsigned long saddr;
- pte_t *spte = NULL;
- pte_t *pte;
- spinlock_t *ptl;
-
- if (!vma_shareable(vma, addr))
- return (pte_t *)pmd_alloc(mm, pud, addr);
-
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
- if (svma == vma)
- continue;
-
- saddr = page_table_shareable(svma, vma, addr, idx);
- if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr);
- if (spte) {
- get_page(virt_to_page(spte));
- break;
- }
- }
- }
-
- if (!spte)
- goto out;
-
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
- if (pud_none(*pud))
- pud_populate(mm, pud,
- (pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
- put_page(virt_to_page(spte));
- spin_unlock(ptl);
-out:
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
- mutex_unlock(&mapping->i_mmap_mutex);
- return pte;
-}
-
-/*
- * unmap huge page backed by shared pte.
- *
- * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
- * indicated by page_count > 1, unmap is achieved by clearing pud and
- * decrementing the ref count. If count == 1, the pte page is not shared.
- *
- * called with page table lock held.
- *
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
- */
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- pgd_t *pgd = pgd_offset(mm, *addr);
- pud_t *pud = pud_offset(pgd, *addr);
-
- BUG_ON(page_count(virt_to_page(ptep)) == 0);
- if (page_count(virt_to_page(ptep)) == 1)
- return 0;
-
- pud_clear(pud);
- put_page(virt_to_page(ptep));
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
- return 1;
-}
-#define want_pmd_share() (1)
-#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
-{
- return NULL;
-}
-#define want_pmd_share() (0)
-#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-
-#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
- unsigned long addr, unsigned long sz)
-{
- pgd_t *pgd;
- pud_t *pud;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- pud = pud_alloc(mm, pgd, addr);
- if (pud) {
- if (sz == PUD_SIZE) {
- pte = (pte_t *)pud;
- } else {
- BUG_ON(sz != PMD_SIZE);
- if (want_pmd_share() && pud_none(*pud))
- pte = huge_pmd_share(mm, addr, pud);
- else
- pte = (pte_t *)pmd_alloc(mm, pud, addr);
- }
- }
- BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
-
- return pte;
-}
-
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (pud_present(*pud)) {
- if (pud_huge(*pud))
- return (pte_t *)pud;
- pmd = pmd_offset(pud, addr);
- }
- }
- return (pte_t *) pmd;
-}
-
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
- return page;
-}
-
-#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-
-/* Can be overriden by architectures */
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
-{
- BUG();
- return NULL;
-}
-
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-
#ifdef CONFIG_MEMORY_FAILURE
/* Should be called in hugetlb_lock */
@@ -3744,45 +3185,3 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
return ret;
}
#endif
-
-bool isolate_huge_page(struct page *page, struct list_head *list)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- if (!get_page_unless_zero(page))
- return false;
- spin_lock(&hugetlb_lock);
- list_move_tail(&page->lru, list);
- spin_unlock(&hugetlb_lock);
- return true;
-}
-
-void putback_active_hugepage(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- spin_lock(&hugetlb_lock);
- list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
- spin_unlock(&hugetlb_lock);
- put_page(page);
-}
-
-bool is_hugepage_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- /*
- * This function can be called for a tail page because the caller,
- * scan_movable_pages, scans through a given pfn-range which typically
- * covers one memory block. In systems using gigantic hugepage (1GB
- * for x86_64,) a hugepage is larger than a memory block, and we don't
- * support migrating such large hugepages for now, so return false
- * when called for tail pages.
- */
- if (PageTail(page))
- return false;
- /*
- * Refcount of a hwpoisoned hugepages is 1, but they are not active,
- * so we should return false for them.
- */
- if (unlikely(PageHWPoison(page)))
- return false;
- return page_count(page) > 0;
-}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 493f758445e..9cea7de22ff 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,18 +30,27 @@ struct hugetlb_cgroup {
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+struct cgroup_subsys hugetlb_subsys __read_mostly;
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
- return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
+ return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+ hugetlb_subsys_id));
}
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
- return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
+ return hugetlb_cgroup_from_css(task_subsys_state(task,
+ hugetlb_subsys_id));
}
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -49,15 +58,17 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
return (h_cg == root_h_cgroup);
}
-static inline struct hugetlb_cgroup *
-parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
+static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
{
- return hugetlb_cgroup_from_css(h_cg->css.parent);
+ if (!cg->parent)
+ return NULL;
+ return hugetlb_cgroup_from_cgroup(cg->parent);
}
-static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
+static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
{
int idx;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
@@ -66,18 +77,19 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
return false;
}
-static struct cgroup_subsys_state *
-hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
{
- struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
- struct hugetlb_cgroup *h_cgroup;
int idx;
+ struct cgroup *parent_cgroup;
+ struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
- if (parent_h_cgroup) {
+ parent_cgroup = cgroup->parent;
+ if (parent_cgroup) {
+ parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
res_counter_init(&h_cgroup->hugepage[idx],
&parent_h_cgroup->hugepage[idx]);
@@ -89,11 +101,11 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &h_cgroup->css;
}
-static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
+static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
{
struct hugetlb_cgroup *h_cgroup;
- h_cgroup = hugetlb_cgroup_from_css(css);
+ h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
kfree(h_cgroup);
}
@@ -105,14 +117,15 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
* page reference and test for page active here. This function
* cannot fail.
*/
-static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
+static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
struct page *page)
{
int csize;
struct res_counter *counter;
struct res_counter *fail_res;
struct hugetlb_cgroup *page_hcg;
- struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+ struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
page_hcg = hugetlb_cgroup_from_page(page);
/*
@@ -142,9 +155,8 @@ out:
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
-static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
+static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
{
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
int idx = 0;
@@ -153,13 +165,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(idx, h_cg, page);
+ hugetlb_cgroup_move_parent(idx, cgroup, page);
spin_unlock(&hugetlb_lock);
idx++;
}
cond_resched();
- } while (hugetlb_cgroup_have_usage(h_cg));
+ } while (hugetlb_cgroup_have_usage(cgroup));
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -181,7 +193,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
@@ -241,28 +253,32 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
return;
}
-static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- int idx, name;
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+ u64 val;
+ char str[64];
+ int idx, name, len;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
idx = MEMFILE_IDX(cft->private);
name = MEMFILE_ATTR(cft->private);
- return res_counter_read_u64(&h_cg->hugepage[idx], name);
+ val = res_counter_read_u64(&h_cg->hugepage[idx], name);
+ len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+ return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
-static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
+ const char *buffer)
{
int idx, name, ret;
unsigned long long val;
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
- buf = strstrip(buf);
- idx = MEMFILE_IDX(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ idx = MEMFILE_IDX(cft->private);
+ name = MEMFILE_ATTR(cft->private);
switch (name) {
case RES_LIMIT:
@@ -272,7 +288,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
break;
}
/* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buf, &val);
+ ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
@@ -281,17 +297,16 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
ret = -EINVAL;
break;
}
- return ret ?: nbytes;
+ return ret;
}
-static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
{
int idx, name, ret = 0;
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
- idx = MEMFILE_IDX(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ idx = MEMFILE_IDX(event);
+ name = MEMFILE_ATTR(event);
switch (name) {
case RES_MAX_USAGE:
@@ -304,7 +319,7 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
ret = -EINVAL;
break;
}
- return ret ?: nbytes;
+ return ret;
}
static char *mem_fmt(char *buf, int size, unsigned long hsize)
@@ -331,34 +346,34 @@ static void __init __hugetlb_cgroup_file_init(int idx)
cft = &h->cgroup_files[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
- cft->read_u64 = hugetlb_cgroup_read_u64;
- cft->write = hugetlb_cgroup_write;
+ cft->read = hugetlb_cgroup_read;
+ cft->write_string = hugetlb_cgroup_write;
/* Add the usage file */
cft = &h->cgroup_files[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
- cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->read = hugetlb_cgroup_read;
/* Add the MAX usage file */
cft = &h->cgroup_files[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
- cft->write = hugetlb_cgroup_reset;
- cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
/* Add the failcntfile */
cft = &h->cgroup_files[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
- cft->write = hugetlb_cgroup_reset;
- cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
/* NULL terminate the last cft */
cft = &h->cgroup_files[4];
memset(cft, 0, sizeof(*cft));
- WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
return;
}
@@ -390,7 +405,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
+ VM_BUG_ON(!PageHuge(oldhpage));
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
@@ -402,8 +417,10 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
return;
}
-struct cgroup_subsys hugetlb_cgrp_subsys = {
+struct cgroup_subsys hugetlb_subsys = {
+ .name = "hugetlb",
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
+ .subsys_id = hugetlb_subsys_id,
};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 95487c71cad..3a61efc518d 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -20,6 +20,8 @@ static int hwpoison_inject(void *data, u64 val)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!hwpoison_filter_enable)
+ goto inject;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -31,9 +33,6 @@ static int hwpoison_inject(void *data, u64 val)
if (!get_page_unless_zero(hpage))
return 0;
- if (!hwpoison_filter_enable)
- goto inject;
-
if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
/*
@@ -55,7 +54,7 @@ static int hwpoison_inject(void *data, u64 val)
return 0;
inject:
- pr_info("Injecting memory failure at pfn %#lx\n", pfn);
+ printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
}
@@ -89,12 +88,12 @@ static int pfn_inject_init(void)
* hardware status change, hence do not require hardware support.
* They are mainly for testing hwpoison in software level.
*/
- dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
+ dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
NULL, &hwpoison_fops);
if (!dentry)
goto fail;
- dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
+ dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
NULL, &unpoison_fops);
if (!dentry)
goto fail;
diff --git a/mm/internal.h b/mm/internal.h
index 7f22a11fcc6..9ba21100ebf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,7 +11,6 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
-#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -22,31 +21,22 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
-extern int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
- unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- return __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-}
-
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+ VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(atomic_read(&page->_count));
set_page_count(page, 1);
}
+static inline void __put_page(struct page *page)
+{
+ atomic_dec(&page->_count);
+}
+
static inline void __get_page_tail_foll(struct page *page,
bool get_page_head)
{
@@ -61,10 +51,12 @@ static inline void __get_page_tail_foll(struct page *page,
* speculative page access (like in
* page_cache_get_speculative()) on tail pages.
*/
- VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
+ VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ VM_BUG_ON(page_mapcount(page) < 0);
if (get_page_head)
atomic_inc(&page->first_page->_count);
- get_huge_page_tail(page);
+ atomic_inc(&page->_mapcount);
}
/*
@@ -86,7 +78,7 @@ static inline void get_page_foll(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
atomic_inc(&page->_count);
}
}
@@ -98,7 +90,6 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern bool zone_reclaimable(struct zone *zone);
/*
* in mm/rmap.c:
@@ -113,7 +104,6 @@ extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
-extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -134,7 +124,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- enum migrate_mode mode; /* Async or sync migration mode */
+ bool sync; /* Synchronous migration */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool finished_update_free; /* True when the zone cached pfns are
* no longer being updated
@@ -144,10 +134,7 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool contended; /* True if a lock was contended, or
- * need_resched() true during async
- * compaction
- */
+ bool contended; /* True if a lock was contended */
};
unsigned long
@@ -160,11 +147,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif
/*
- * This function returns the order of a free page in the buddy system. In
- * general, page_zone(page)->lock must be held by the caller to prevent the
- * page from being allocated in parallel and returning garbage as the order.
- * If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel.
+ * function for dealing with page's order in buddy system.
+ * zone->lock is already acquired when we use these.
+ * So, we don't need atomic page->flags operations here.
*/
static inline unsigned long page_order(struct page *page)
{
@@ -172,18 +157,13 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
-static inline bool is_cow_mapping(vm_flags_t flags)
-{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-}
-
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
#ifdef CONFIG_MMU
-extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking);
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
@@ -192,10 +172,30 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
+ * Called only in fault path, to determine if a new page is being
+ * mapped into a LOCKED vma. If it is, mark page as mlocked.
+ */
+static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
+ struct page *page)
+{
+ VM_BUG_ON(PageLRU(page));
+
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+ return 0;
+
+ if (!TestSetPageMlocked(page)) {
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ return 1;
+}
+
+/*
* must be called with vma's mmap_sem held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
-extern unsigned int munlock_vma_page(struct page *page);
+extern void munlock_vma_page(struct page *page);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
@@ -233,6 +233,10 @@ extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
#endif
#else /* !CONFIG_MMU */
+static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
+{
+ return 0;
+}
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
@@ -369,6 +373,5 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
-#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
deleted file mode 100644
index 7b5dbd1517b..00000000000
--- a/mm/iov_iter.c
+++ /dev/null
@@ -1,743 +0,0 @@
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
- void *kaddr, *from;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- if (!fault_in_pages_writeable(buf, copy)) {
- kaddr = kmap_atomic(page);
- from = kaddr + offset;
-
- /* first chunk, usually the only one */
- left = __copy_to_user_inatomic(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user_inatomic(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- if (likely(!bytes)) {
- kunmap_atomic(kaddr);
- goto done;
- }
- offset = from - kaddr;
- buf += copy;
- kunmap_atomic(kaddr);
- copy = min(bytes, iov->iov_len - skip);
- }
- /* Too bad - revert to non-atomic kmap */
- kaddr = kmap(page);
- from = kaddr + offset;
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- kunmap(page);
-done:
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
- void *kaddr, *to;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- if (!fault_in_pages_readable(buf, copy)) {
- kaddr = kmap_atomic(page);
- to = kaddr + offset;
-
- /* first chunk, usually the only one */
- left = __copy_from_user_inatomic(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user_inatomic(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- if (likely(!bytes)) {
- kunmap_atomic(kaddr);
- goto done;
- }
- offset = to - kaddr;
- buf += copy;
- kunmap_atomic(kaddr);
- copy = min(bytes, iov->iov_len - skip);
- }
- /* Too bad - revert to non-atomic kmap */
- kaddr = kmap(page);
- to = kaddr + offset;
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- kunmap(page);
-done:
- if (skip == iov->iov_len) {
- iov++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static size_t copy_from_user_atomic_iovec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-
-static void advance_iovec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
- if (!(i->type & ITER_BVEC)) {
- char __user *buf = i->iov->iov_base + i->iov_offset;
- bytes = min(bytes, i->iov->iov_len - i->iov_offset);
- return fault_in_pages_readable(buf, bytes);
- }
- return 0;
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-static unsigned long alignment_iovec(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = (unsigned long)iov->iov_base + i->iov_offset;
- n = iov->iov_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++iov)->iov_len) {
- res |= (unsigned long)iov->iov_base | iov->iov_len;
- size -= iov->iov_len;
- }
- res |= (unsigned long)iov->iov_base | size;
- return res;
-}
-
-void iov_iter_init(struct iov_iter *i, int direction,
- const struct iovec *iov, unsigned long nr_segs,
- size_t count)
-{
- /* It will get better. Eventually... */
- if (segment_eq(get_fs(), KERNEL_DS))
- direction |= ITER_KVEC;
- i->type = direction;
- i->iov = iov;
- i->nr_segs = nr_segs;
- i->iov_offset = 0;
- i->count = count;
-}
-EXPORT_SYMBOL(iov_iter_init);
-
-static ssize_t get_pages_iovec(struct iov_iter *i,
- struct page **pages, size_t maxsize,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
- if (unlikely(res < 0))
- return res;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- size_t offset = i->iov_offset;
- const struct iovec *iov = i->iov;
- size_t len;
- unsigned long addr;
- void *p;
- int n;
- int res;
-
- len = iov->iov_len - offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- addr = (unsigned long)iov->iov_base + offset;
- len += *start = addr & (PAGE_SIZE - 1);
- addr &= ~(PAGE_SIZE - 1);
- n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
-
- p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
- if (!p)
- p = vmalloc(n * sizeof(struct page *));
- if (!p)
- return -ENOMEM;
-
- res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
- if (unlikely(res < 0)) {
- kvfree(p);
- return res;
- }
- *pages = p;
- return (res == n ? len : res * PAGE_SIZE) - *start;
-}
-
-static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct iovec *iov = i->iov;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, iov++) {
- unsigned long addr = (unsigned long)iov->iov_base + offset;
- size_t len = iov->iov_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
- - addr / PAGE_SIZE;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
-
-static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
-{
- char *from = kmap_atomic(page);
- memcpy(to, from + offset, len);
- kunmap_atomic(from);
-}
-
-static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
-{
- char *to = kmap_atomic(page);
- memcpy(to + offset, from, len);
- kunmap_atomic(to);
-}
-
-static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
- void *kaddr, *from;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
- copy = min_t(size_t, bytes, bvec->bv_len - skip);
-
- kaddr = kmap_atomic(page);
- from = kaddr + offset;
- memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
- skip += copy;
- from += copy;
- bytes -= copy;
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- kunmap_atomic(kaddr);
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted - bytes;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-
-static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, wanted;
- const struct bio_vec *bvec;
- void *kaddr, *to;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- bvec = i->bvec;
- skip = i->iov_offset;
-
- kaddr = kmap_atomic(page);
-
- to = kaddr + offset;
-
- copy = min(bytes, bvec->bv_len - skip);
-
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
-
- to += copy;
- skip += copy;
- bytes -= copy;
-
- while (bytes) {
- bvec++;
- copy = min(bytes, (size_t)bvec->bv_len);
- memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- kunmap_atomic(kaddr);
- if (skip == bvec->bv_len) {
- bvec++;
- skip = 0;
- }
- i->count -= wanted;
- i->nr_segs -= bvec - i->bvec;
- i->bvec = bvec;
- i->iov_offset = skip;
- return wanted;
-}
-
-static size_t copy_from_user_bvec(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t left;
- const struct bio_vec *bvec;
- size_t base = i->iov_offset;
-
- kaddr = kmap_atomic(page);
- for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
- size_t copy = min(left, bvec->bv_len - base);
- if (!bvec->bv_len)
- continue;
- memcpy_from_page(kaddr + offset, bvec->bv_page,
- bvec->bv_offset + base, copy);
- offset += copy;
- left -= copy;
- }
- kunmap_atomic(kaddr);
- return bytes;
-}
-
-static void advance_bvec(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct bio_vec *bvec = i->bvec;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !bvec->bv_len)) {
- int copy;
-
- copy = min(bytes, bvec->bv_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (bvec->bv_len == base) {
- bvec++;
- nr_segs--;
- base = 0;
- }
- }
- i->bvec = bvec;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-
-static unsigned long alignment_bvec(const struct iov_iter *i)
-{
- const struct bio_vec *bvec = i->bvec;
- unsigned long res;
- size_t size = i->count;
- size_t n;
-
- if (!size)
- return 0;
-
- res = bvec->bv_offset + i->iov_offset;
- n = bvec->bv_len - i->iov_offset;
- if (n >= size)
- return res | size;
- size -= n;
- res |= n;
- while (size > (++bvec)->bv_len) {
- res |= bvec->bv_offset | bvec->bv_len;
- size -= bvec->bv_len;
- }
- res |= bvec->bv_offset | size;
- return res;
-}
-
-static ssize_t get_pages_bvec(struct iov_iter *i,
- struct page **pages, size_t maxsize,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- *start = bvec->bv_offset + i->iov_offset;
-
- get_page(*pages = bvec->bv_page);
-
- return len;
-}
-
-static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- const struct bio_vec *bvec = i->bvec;
- size_t len = bvec->bv_len - i->iov_offset;
- if (len > i->count)
- len = i->count;
- if (len > maxsize)
- len = maxsize;
- *start = bvec->bv_offset + i->iov_offset;
-
- *pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
- if (!*pages)
- return -ENOMEM;
-
- get_page(**pages = bvec->bv_page);
-
- return len;
-}
-
-static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
-{
- size_t offset = i->iov_offset;
- size_t size = i->count;
- const struct bio_vec *bvec = i->bvec;
- int npages = 0;
- int n;
-
- for (n = 0; size && n < i->nr_segs; n++, bvec++) {
- size_t len = bvec->bv_len - offset;
- offset = 0;
- if (unlikely(!len)) /* empty segment */
- continue;
- if (len > size)
- len = size;
- npages++;
- if (npages >= maxpages) /* don't bother going further */
- return maxpages;
- size -= len;
- offset = 0;
- }
- return min(npages, maxpages);
-}
-
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- if (i->type & ITER_BVEC)
- return copy_page_to_iter_bvec(page, offset, bytes, i);
- else
- return copy_page_to_iter_iovec(page, offset, bytes, i);
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- if (i->type & ITER_BVEC)
- return copy_page_from_iter_bvec(page, offset, bytes, i);
- else
- return copy_page_from_iter_iovec(page, offset, bytes, i);
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
-size_t iov_iter_copy_from_user_atomic(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- if (i->type & ITER_BVEC)
- return copy_from_user_bvec(page, i, offset, bytes);
- else
- return copy_from_user_atomic_iovec(page, i, offset, bytes);
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-void iov_iter_advance(struct iov_iter *i, size_t size)
-{
- if (i->type & ITER_BVEC)
- advance_bvec(i, size);
- else
- advance_iovec(i, size);
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
- if (i->nr_segs == 1)
- return i->count;
- else if (i->type & ITER_BVEC)
- return min(i->count, i->iov->iov_len - i->iov_offset);
- else
- return min(i->count, i->bvec->bv_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
-unsigned long iov_iter_alignment(const struct iov_iter *i)
-{
- if (i->type & ITER_BVEC)
- return alignment_bvec(i);
- else
- return alignment_iovec(i);
-}
-EXPORT_SYMBOL(iov_iter_alignment);
-
-ssize_t iov_iter_get_pages(struct iov_iter *i,
- struct page **pages, size_t maxsize,
- size_t *start)
-{
- if (i->type & ITER_BVEC)
- return get_pages_bvec(i, pages, maxsize, start);
- else
- return get_pages_iovec(i, pages, maxsize, start);
-}
-EXPORT_SYMBOL(iov_iter_get_pages);
-
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
- struct page ***pages, size_t maxsize,
- size_t *start)
-{
- if (i->type & ITER_BVEC)
- return get_pages_alloc_bvec(i, pages, maxsize, start);
- else
- return get_pages_alloc_iovec(i, pages, maxsize, start);
-}
-EXPORT_SYMBOL(iov_iter_get_pages_alloc);
-
-int iov_iter_npages(const struct iov_iter *i, int maxpages)
-{
- if (i->type & ITER_BVEC)
- return iov_iter_npages_bvec(i, maxpages);
- else
- return iov_iter_npages_iovec(i, maxpages);
-}
-EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dcdcadb6953..ff0d9779cec 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -18,8 +18,6 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#define pr_fmt(fmt) "kmemleak: " fmt
-
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -52,25 +50,25 @@ static int __init kmemleak_test_init(void)
printk(KERN_INFO "Kmemleak testing\n");
/* make some orphan objects */
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
#ifndef CONFIG_MODULES
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
+ pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
+ pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
#endif
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
/*
* Add elements to a list. They should only appear as orphan
@@ -78,7 +76,7 @@ static int __init kmemleak_test_init(void)
*/
for (i = 0; i < 10; i++) {
elem = kzalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
+ pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
if (!elem)
return -ENOMEM;
INIT_LIST_HEAD(&elem->list);
@@ -87,7 +85,7 @@ static int __init kmemleak_test_init(void)
for_each_possible_cpu(i) {
per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
- pr_info("kmalloc(129) = %p\n",
+ pr_info("kmemleak: kmalloc(129) = %p\n",
per_cpu(kmemleak_test_pointer, i));
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3cda50c1e39..752a705c77c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -192,15 +192,15 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
/* set in the late_initcall if there were no errors */
-static int kmemleak_initialized;
+static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
+static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
/* set if a kmemleak warning was issued */
-static int kmemleak_warning;
+static atomic_t kmemleak_warning = ATOMIC_INIT(0);
/* set if a fatal kmemleak error has occurred */
-static int kmemleak_error;
+static atomic_t kmemleak_error = ATOMIC_INIT(0);
/* minimum and maximum address that may be valid pointers */
static unsigned long min_addr = ULONG_MAX;
@@ -218,8 +218,7 @@ static int kmemleak_stack_scan = 1;
static DEFINE_MUTEX(scan_mutex);
/* setting kmemleak=on, will set this var, skipping the disable */
static int kmemleak_skip_disable;
-/* If there are leaks that can be reported */
-static bool kmemleak_found_leaks;
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -268,7 +267,7 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warning(x); \
dump_stack(); \
- kmemleak_warning = 1; \
+ atomic_set(&kmemleak_warning, 1); \
} while (0)
/*
@@ -387,7 +386,7 @@ static void dump_object_info(struct kmemleak_object *object)
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%lx\n", object->flags);
- pr_notice(" checksum = %u\n", object->checksum);
+ pr_notice(" checksum = %d\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
}
@@ -437,7 +436,7 @@ static int get_object(struct kmemleak_object *object)
*/
static void free_object_rcu(struct rcu_head *rcu)
{
- struct hlist_node *tmp;
+ struct hlist_node *elem, *tmp;
struct kmemleak_scan_area *area;
struct kmemleak_object *object =
container_of(rcu, struct kmemleak_object, rcu);
@@ -446,8 +445,8 @@ static void free_object_rcu(struct rcu_head *rcu)
* Once use_count is 0 (guaranteed by put_object), there is no other
* code accessing this object, hence no need for locking.
*/
- hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
- hlist_del(&area->node);
+ hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
+ hlist_del(elem);
kmem_cache_free(scan_area_cache, area);
}
kmem_cache_free(object_cache, object);
@@ -754,9 +753,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
- if (size == SIZE_MAX) {
- size = object->pointer + object->size - ptr;
- } else if (ptr + size > object->pointer + object->size) {
+ if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
@@ -806,7 +803,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
unsigned long flags;
struct early_log *log;
- if (kmemleak_error) {
+ if (atomic_read(&kmemleak_error)) {
/* kmemleak stopped recording, just count the requests */
crt_early_log++;
return;
@@ -841,7 +838,7 @@ static void early_alloc(struct early_log *log)
unsigned long flags;
int i;
- if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
+ if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
return;
/*
@@ -894,9 +891,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
{
pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -920,11 +917,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
* Percpu allocations are only scanned and not reported as leaks
* (min_count is set to 0).
*/
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, GFP_KERNEL);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -940,9 +937,9 @@ void __ref kmemleak_free(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -960,9 +957,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -980,50 +977,16 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
/**
- * kmemleak_update_trace - update object allocation stack trace
- * @ptr: pointer to beginning of the object
- *
- * Override the object allocation stack trace for cases where the actual
- * allocation place is not always useful.
- */
-void __ref kmemleak_update_trace(const void *ptr)
-{
- struct kmemleak_object *object;
- unsigned long flags;
-
- pr_debug("%s(0x%p)\n", __func__, ptr);
-
- if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
- return;
-
- object = find_and_get_object((unsigned long)ptr, 1);
- if (!object) {
-#ifdef DEBUG
- kmemleak_warn("Updating stack trace for unknown object at %p\n",
- ptr);
-#endif
- return;
- }
-
- spin_lock_irqsave(&object->lock, flags);
- object->trace_len = __save_stack_trace(object->trace);
- spin_unlock_irqrestore(&object->lock, flags);
-
- put_object(object);
-}
-EXPORT_SYMBOL(kmemleak_update_trace);
-
-/**
* kmemleak_not_leak - mark an allocated object as false positive
* @ptr: pointer to beginning of the object
*
@@ -1034,9 +997,9 @@ void __ref kmemleak_not_leak(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -1054,9 +1017,9 @@ void __ref kmemleak_ignore(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1076,9 +1039,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -1096,9 +1059,9 @@ void __ref kmemleak_no_scan(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (kmemleak_early_log)
+ else if (atomic_read(&kmemleak_early_log))
log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
@@ -1123,7 +1086,7 @@ static bool update_checksum(struct kmemleak_object *object)
*/
static int scan_should_stop(void)
{
- if (!kmemleak_enabled)
+ if (!atomic_read(&kmemleak_enabled))
return 1;
/*
@@ -1214,6 +1177,7 @@ static void scan_block(void *_start, void *_end,
static void scan_object(struct kmemleak_object *object)
{
struct kmemleak_scan_area *area;
+ struct hlist_node *elem;
unsigned long flags;
/*
@@ -1241,7 +1205,7 @@ static void scan_object(struct kmemleak_object *object)
spin_lock_irqsave(&object->lock, flags);
}
} else
- hlist_for_each_entry(area, &object->area_list, node)
+ hlist_for_each_entry(area, elem, &object->area_list, node)
scan_block((void *)area->start,
(void *)(area->start + area->size),
object, 0);
@@ -1334,10 +1298,11 @@ static void kmemleak_scan(void)
/*
* Struct page scanning for each node.
*/
- get_online_mems();
+ lock_memory_hotplug();
for_each_online_node(i) {
- unsigned long start_pfn = node_start_pfn(i);
- unsigned long end_pfn = node_end_pfn(i);
+ pg_data_t *pgdat = NODE_DATA(i);
+ unsigned long start_pfn = pgdat->node_start_pfn;
+ unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -1352,7 +1317,7 @@ static void kmemleak_scan(void)
scan_block(page, page + 1, NULL, 1);
}
}
- put_online_mems();
+ unlock_memory_hotplug();
/*
* Scanning the task stacks (may introduce false negatives).
@@ -1417,12 +1382,9 @@ static void kmemleak_scan(void)
}
rcu_read_unlock();
- if (new_leaks) {
- kmemleak_found_leaks = true;
-
+ if (new_leaks)
pr_info("%d new suspected memory leaks (see "
"/sys/kernel/debug/kmemleak)\n", new_leaks);
- }
}
@@ -1583,6 +1545,11 @@ static int kmemleak_open(struct inode *inode, struct file *file)
return seq_open(file, &kmemleak_seq_ops);
}
+static int kmemleak_release(struct inode *inode, struct file *file)
+{
+ return seq_release(inode, file);
+}
+
static int dump_str_object_info(const char *str)
{
unsigned long flags;
@@ -1625,12 +1592,8 @@ static void kmemleak_clear(void)
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
-
- kmemleak_found_leaks = false;
}
-static void __kmemleak_do_cleanup(void);
-
/*
* File write operation to configure kmemleak at run-time. The following
* commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1643,8 +1606,7 @@ static void __kmemleak_do_cleanup(void);
* disable it)
* scan - trigger a memory scan
* clear - mark all current reported unreferenced kmemleak objects as
- * grey to ignore printing them, or free all kmemleak objects
- * if kmemleak has been disabled.
+ * grey to ignore printing them
* dump=... - dump information about the object found at the given address
*/
static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
@@ -1654,6 +1616,9 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
int buf_size;
int ret;
+ if (!atomic_read(&kmemleak_enabled))
+ return -EBUSY;
+
buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
@@ -1663,19 +1628,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
if (ret < 0)
return ret;
- if (strncmp(buf, "clear", 5) == 0) {
- if (kmemleak_enabled)
- kmemleak_clear();
- else
- __kmemleak_do_cleanup();
- goto out;
- }
-
- if (!kmemleak_enabled) {
- ret = -EBUSY;
- goto out;
- }
-
if (strncmp(buf, "off", 3) == 0)
kmemleak_disable();
else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1689,7 +1641,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
else if (strncmp(buf, "scan=", 5) == 0) {
unsigned long secs;
- ret = kstrtoul(buf + 5, 0, &secs);
+ ret = strict_strtoul(buf + 5, 0, &secs);
if (ret < 0)
goto out;
stop_scan_thread();
@@ -1699,6 +1651,8 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
}
} else if (strncmp(buf, "scan", 4) == 0)
kmemleak_scan();
+ else if (strncmp(buf, "clear", 5) == 0)
+ kmemleak_clear();
else if (strncmp(buf, "dump=", 5) == 0)
ret = dump_str_object_info(buf + 5);
else
@@ -1720,19 +1674,9 @@ static const struct file_operations kmemleak_fops = {
.read = seq_read,
.write = kmemleak_write,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = kmemleak_release,
};
-static void __kmemleak_do_cleanup(void)
-{
- struct kmemleak_object *object;
-
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
-}
-
/*
* Stop the memory scanning thread and free the kmemleak internal objects if
* no previous scan thread (otherwise, kmemleak may still have some useful
@@ -1740,14 +1684,18 @@ static void __kmemleak_do_cleanup(void)
*/
static void kmemleak_do_cleanup(struct work_struct *work)
{
+ struct kmemleak_object *object;
+ bool cleanup = scan_thread == NULL;
+
mutex_lock(&scan_mutex);
stop_scan_thread();
- if (!kmemleak_found_leaks)
- __kmemleak_do_cleanup();
- else
- pr_info("Kmemleak disabled without freeing internal data. "
- "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+ if (cleanup) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list)
+ delete_object_full(object->pointer);
+ rcu_read_unlock();
+ }
mutex_unlock(&scan_mutex);
}
@@ -1760,14 +1708,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
static void kmemleak_disable(void)
{
/* atomically check whether it was already invoked */
- if (cmpxchg(&kmemleak_error, 0, 1))
+ if (atomic_cmpxchg(&kmemleak_error, 0, 1))
return;
/* stop any memory operation tracing */
- kmemleak_enabled = 0;
+ atomic_set(&kmemleak_enabled, 0);
/* check whether it is too early for a kernel thread */
- if (kmemleak_initialized)
+ if (atomic_read(&kmemleak_initialized))
schedule_work(&cleanup_work);
pr_info("Kernel memory leak detector disabled\n");
@@ -1811,7 +1759,7 @@ void __init kmemleak_init(void)
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
- kmemleak_early_log = 0;
+ atomic_set(&kmemleak_early_log, 0);
kmemleak_disable();
return;
}
@@ -1829,12 +1777,12 @@ void __init kmemleak_init(void)
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
- kmemleak_early_log = 0;
- if (kmemleak_error) {
+ atomic_set(&kmemleak_early_log, 0);
+ if (atomic_read(&kmemleak_error)) {
local_irq_restore(flags);
return;
} else
- kmemleak_enabled = 1;
+ atomic_set(&kmemleak_enabled, 1);
local_irq_restore(flags);
/*
@@ -1878,9 +1826,9 @@ void __init kmemleak_init(void)
log->op_type);
}
- if (kmemleak_warning) {
+ if (atomic_read(&kmemleak_warning)) {
print_log_trace(log);
- kmemleak_warning = 0;
+ atomic_set(&kmemleak_warning, 0);
}
}
}
@@ -1892,9 +1840,9 @@ static int __init kmemleak_late_init(void)
{
struct dentry *dentry;
- kmemleak_initialized = 1;
+ atomic_set(&kmemleak_initialized, 1);
- if (kmemleak_error) {
+ if (atomic_read(&kmemleak_error)) {
/*
* Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
diff --git a/mm/ksm.c b/mm/ksm.c
index 346ddc9e4c0..51573858938 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,22 +33,13 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
-#include <linux/hashtable.h>
+#include <linux/hash.h>
#include <linux/freezer.h>
#include <linux/oom.h>
-#include <linux/numa.h>
#include <asm/tlbflush.h>
#include "internal.h"
-#ifdef CONFIG_NUMA
-#define NUMA(x) (x)
-#define DO_NUMA(x) do { (x); } while (0)
-#else
-#define NUMA(x) (0)
-#define DO_NUMA(x) do { } while (0)
-#endif
-
/*
* A few notes about the KSM scanning process,
* to make it easier to understand the data structures below:
@@ -87,9 +78,6 @@
* take 10 attempts to find a page in the unstable tree, once it is found,
* it is secured in the stable tree. (When we scan a new page, we first
* compare it against the stable tree, and then against the unstable tree.)
- *
- * If the merge_across_nodes tunable is unset, then KSM maintains multiple
- * stable trees and multiple unstable trees: one of each for each NUMA node.
*/
/**
@@ -125,32 +113,19 @@ struct ksm_scan {
/**
* struct stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
- * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
- * @list: linked into migrate_nodes, pending placement in the proper node tree
* @hlist: hlist head of rmap_items using this ksm page
- * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
- * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
+ * @kpfn: page frame number of this ksm page
*/
struct stable_node {
- union {
- struct rb_node node; /* when node of stable tree */
- struct { /* when listed for migration */
- struct list_head *head;
- struct list_head list;
- };
- };
+ struct rb_node node;
struct hlist_head hlist;
unsigned long kpfn;
-#ifdef CONFIG_NUMA
- int nid;
-#endif
};
/**
* struct rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
- * @nid: NUMA node id of unstable tree in which linked (may not match page)
* @mm: the memory structure this rmap_item is pointing into
* @address: the virtual address this rmap_item tracks (+ flags in low bits)
* @oldchecksum: previous checksum of the page at that virtual address
@@ -160,12 +135,7 @@ struct stable_node {
*/
struct rmap_item {
struct rmap_item *rmap_list;
- union {
- struct anon_vma *anon_vma; /* when stable */
-#ifdef CONFIG_NUMA
- int nid; /* when node of unstable tree */
-#endif
- };
+ struct anon_vma *anon_vma; /* when stable */
struct mm_struct *mm;
unsigned long address; /* + low bits used for flags below */
unsigned int oldchecksum; /* when unstable */
@@ -183,16 +153,12 @@ struct rmap_item {
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
/* The stable and unstable tree heads */
-static struct rb_root one_stable_tree[1] = { RB_ROOT };
-static struct rb_root one_unstable_tree[1] = { RB_ROOT };
-static struct rb_root *root_stable_tree = one_stable_tree;
-static struct rb_root *root_unstable_tree = one_unstable_tree;
-
-/* Recently migrated nodes of stable tree, pending proper placement */
-static LIST_HEAD(migrate_nodes);
+static struct rb_root root_stable_tree = RB_ROOT;
+static struct rb_root root_unstable_tree = RB_ROOT;
-#define MM_SLOTS_HASH_BITS 10
-static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+#define MM_SLOTS_HASH_SHIFT 10
+#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
+static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -223,21 +189,10 @@ static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
-#ifdef CONFIG_NUMA
-/* Zeroed when merging across nodes is not allowed */
-static unsigned int ksm_merge_across_nodes = 1;
-static int ksm_nr_node_ids = 1;
-#else
-#define ksm_merge_across_nodes 1U
-#define ksm_nr_node_ids 1
-#endif
-
#define KSM_RUN_STOP 0
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
-#define KSM_RUN_OFFLINE 4
-static unsigned long ksm_run = KSM_RUN_STOP;
-static void wait_while_offlining(void);
+static unsigned int ksm_run = KSM_RUN_STOP;
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
@@ -320,20 +275,31 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
- struct mm_slot *slot;
-
- hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
- if (slot->mm == mm)
- return slot;
+ struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
+ hlist_for_each_entry(mm_slot, node, bucket, link) {
+ if (mm == mm_slot->mm)
+ return mm_slot;
+ }
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
+ hlist_add_head(&mm_slot->link, bucket);
+}
+
+static inline int in_stable_tree(struct rmap_item *rmap_item)
+{
+ return rmap_item->address & STABLE_FLAG;
}
/*
@@ -367,7 +333,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
- page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
@@ -444,7 +410,7 @@ static void break_cow(struct rmap_item *rmap_item)
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
+ struct page *head = compound_trans_head(page);
/*
* head may actually be splitted and freed from under
* us but it's ok here.
@@ -481,22 +447,12 @@ out: page = NULL;
return page;
}
-/*
- * This helper is used for getting right index into array of tree roots.
- * When merge_across_nodes knob is set to 1, there are only two rb-trees for
- * stable and unstable pages from all nodes with roots in index 0. Otherwise,
- * every node has its own stable and unstable tree.
- */
-static inline int get_kpfn_nid(unsigned long kpfn)
-{
- return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
-}
-
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
struct rmap_item *rmap_item;
+ struct hlist_node *hlist;
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
if (rmap_item->hlist.next)
ksm_pages_sharing--;
else
@@ -506,11 +462,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
cond_resched();
}
- if (stable_node->head == &migrate_nodes)
- list_del(&stable_node->list);
- else
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
+ rb_erase(&stable_node->node, &root_stable_tree);
free_stable_node(stable_node);
}
@@ -520,7 +472,6 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* In which case we can trust the content of the page, and it
* returns the gotten page; but if the page has now been zapped,
* remove the stale node from the stable tree and return NULL.
- * But beware, the stable node's page might be being migrated.
*
* You would expect the stable_node to hold a reference to the ksm page.
* But if it increments the page's count, swapping out has to wait for
@@ -531,77 +482,40 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* pointing back to this stable node. This relies on freeing a PageAnon
* page to reset its page->mapping to NULL, and relies on no other use of
* a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+static struct page *get_ksm_page(struct stable_node *stable_node)
{
struct page *page;
void *expected_mapping;
- unsigned long kpfn;
+ page = pfn_to_page(stable_node->kpfn);
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
-again:
- kpfn = ACCESS_ONCE(stable_node->kpfn);
- page = pfn_to_page(kpfn);
-
- /*
- * page is computed from kpfn, so on most architectures reading
- * page->mapping is naturally ordered after reading node->kpfn,
- * but on Alpha we need to be more careful.
- */
- smp_read_barrier_depends();
- if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ rcu_read_lock();
+ if (page->mapping != expected_mapping)
goto stale;
-
- /*
- * We cannot do anything with the page while its refcount is 0.
- * Usually 0 means free, or tail of a higher-order page: in which
- * case this node is no longer referenced, and should be freed;
- * however, it might mean that the page is under page_freeze_refs().
- * The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
- */
- while (!get_page_unless_zero(page)) {
- /*
- * Another check for page->mapping != expected_mapping would
- * work here too. We have chosen the !PageSwapCache test to
- * optimize the common case, when the page is or is about to
- * be freed: PageSwapCache is cleared (under spin_lock_irq)
- * in the freeze_refs section of __remove_mapping(); but Anon
- * page->mapping reset to NULL later, in free_pages_prepare().
- */
- if (!PageSwapCache(page))
- goto stale;
- cpu_relax();
- }
-
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (!get_page_unless_zero(page))
+ goto stale;
+ if (page->mapping != expected_mapping) {
put_page(page);
goto stale;
}
-
- if (lock_it) {
- lock_page(page);
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
- unlock_page(page);
- put_page(page);
- goto stale;
- }
- }
+ rcu_read_unlock();
return page;
-
stale:
- /*
- * We come here from above when page->mapping or !PageSwapCache
- * suggests that the node is stale; but it might be under migration.
- * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
- * before checking whether node->kpfn has been changed.
- */
- smp_rmb();
- if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
- goto again;
+ rcu_read_unlock();
remove_node_from_stable_tree(stable_node);
return NULL;
}
@@ -617,10 +531,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node);
if (!page)
goto out;
+ lock_page(page);
hlist_del(&rmap_item->hlist);
unlock_page(page);
put_page(page);
@@ -645,8 +560,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
BUG_ON(age > 1);
if (!age)
- rb_erase(&rmap_item->node,
- root_unstable_tree + NUMA(rmap_item->nid));
+ rb_erase(&rmap_item->node, &root_unstable_tree);
+
ksm_pages_unshared--;
rmap_item->address &= PAGE_MASK;
}
@@ -666,7 +581,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
}
/*
- * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -699,71 +614,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
/*
* Only called through the sysfs control interface:
*/
-static int remove_stable_node(struct stable_node *stable_node)
-{
- struct page *page;
- int err;
-
- page = get_ksm_page(stable_node, true);
- if (!page) {
- /*
- * get_ksm_page did remove_node_from_stable_tree itself.
- */
- return 0;
- }
-
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
- /*
- * The stable node did not yet appear stale to get_ksm_page(),
- * since that allows for an unmapped ksm page to be recognized
- * right up until it is freed; but the node is safe to remove.
- * This page might be in a pagevec waiting to be freed,
- * or it might be PageSwapCache (perhaps under writeback),
- * or it might have been removed from swapcache a moment ago.
- */
- set_page_stable_node(page, NULL);
- remove_node_from_stable_tree(stable_node);
- err = 0;
- }
-
- unlock_page(page);
- put_page(page);
- return err;
-}
-
-static int remove_all_stable_nodes(void)
-{
- struct stable_node *stable_node;
- struct list_head *this, *next;
- int nid;
- int err = 0;
-
- for (nid = 0; nid < ksm_nr_node_ids; nid++) {
- while (root_stable_tree[nid].rb_node) {
- stable_node = rb_entry(root_stable_tree[nid].rb_node,
- struct stable_node, node);
- if (remove_stable_node(stable_node)) {
- err = -EBUSY;
- break; /* proceed to next nid */
- }
- cond_resched();
- }
- }
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
- if (remove_stable_node(stable_node))
- err = -EBUSY;
- cond_resched();
- }
- return err;
-}
-
static int unmerge_and_remove_all_rmap_items(void)
{
struct mm_slot *mm_slot;
@@ -797,7 +647,7 @@ static int unmerge_and_remove_all_rmap_items(void)
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
struct mm_slot, mm_list);
if (ksm_test_exit(mm)) {
- hash_del(&mm_slot->link);
+ hlist_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -811,8 +661,6 @@ static int unmerge_and_remove_all_rmap_items(void)
}
}
- /* Clean up stable nodes, but don't worry if some are still busy */
- remove_all_stable_nodes();
ksm_scan.seqnr = 0;
return 0;
@@ -945,6 +793,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ BUG_ON(pmd_trans_huge(*pmd));
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
@@ -1097,9 +946,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
if (err)
goto out;
- /* Unstable nid is in union with stable anon_vma: remove first */
- remove_rmap_item_from_tree(rmap_item);
-
/* Must get reference to anon_vma while still holding mmap_sem */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
@@ -1150,99 +996,42 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
*/
static struct page *stable_tree_search(struct page *page)
{
- int nid;
- struct rb_root *root;
- struct rb_node **new;
- struct rb_node *parent;
+ struct rb_node *node = root_stable_tree.rb_node;
struct stable_node *stable_node;
- struct stable_node *page_node;
- page_node = page_stable_node(page);
- if (page_node && page_node->head != &migrate_nodes) {
- /* ksm page forked */
+ stable_node = page_stable_node(page);
+ if (stable_node) { /* ksm page forked */
get_page(page);
return page;
}
- nid = get_kpfn_nid(page_to_pfn(page));
- root = root_stable_tree + nid;
-again:
- new = &root->rb_node;
- parent = NULL;
-
- while (*new) {
+ while (node) {
struct page *tree_page;
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ stable_node = rb_entry(node, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node);
if (!tree_page)
return NULL;
ret = memcmp_pages(page, tree_page);
- put_page(tree_page);
- parent = *new;
- if (ret < 0)
- new = &parent->rb_left;
- else if (ret > 0)
- new = &parent->rb_right;
- else {
- /*
- * Lock and unlock the stable_node's page (which
- * might already have been migrated) so that page
- * migration is sure to notice its raised count.
- * It would be more elegant to return stable_node
- * than kpage, but that involves more changes.
- */
- tree_page = get_ksm_page(stable_node, true);
- if (tree_page) {
- unlock_page(tree_page);
- if (get_kpfn_nid(stable_node->kpfn) !=
- NUMA(stable_node->nid)) {
- put_page(tree_page);
- goto replace;
- }
- return tree_page;
- }
- /*
- * There is now a place for page_node, but the tree may
- * have been rebalanced, so re-evaluate parent and new.
- */
- if (page_node)
- goto again;
- return NULL;
- }
+ if (ret < 0) {
+ put_page(tree_page);
+ node = node->rb_left;
+ } else if (ret > 0) {
+ put_page(tree_page);
+ node = node->rb_right;
+ } else
+ return tree_page;
}
- if (!page_node)
- return NULL;
-
- list_del(&page_node->list);
- DO_NUMA(page_node->nid = nid);
- rb_link_node(&page_node->node, parent, new);
- rb_insert_color(&page_node->node, root);
- get_page(page);
- return page;
-
-replace:
- if (page_node) {
- list_del(&page_node->list);
- DO_NUMA(page_node->nid = nid);
- rb_replace_node(&stable_node->node, &page_node->node, root);
- get_page(page);
- } else {
- rb_erase(&stable_node->node, root);
- page = NULL;
- }
- stable_node->head = &migrate_nodes;
- list_add(&stable_node->list, stable_node->head);
- return page;
+ return NULL;
}
/*
- * stable_tree_insert - insert stable tree node pointing to new ksm page
+ * stable_tree_insert - insert rmap_item pointing to new ksm page
* into the stable tree.
*
* This function returns the stable tree node just allocated on success,
@@ -1250,25 +1039,17 @@ replace:
*/
static struct stable_node *stable_tree_insert(struct page *kpage)
{
- int nid;
- unsigned long kpfn;
- struct rb_root *root;
- struct rb_node **new;
+ struct rb_node **new = &root_stable_tree.rb_node;
struct rb_node *parent = NULL;
struct stable_node *stable_node;
- kpfn = page_to_pfn(kpage);
- nid = get_kpfn_nid(kpfn);
- root = root_stable_tree + nid;
- new = &root->rb_node;
-
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
- tree_page = get_ksm_page(stable_node, false);
+ tree_page = get_ksm_page(stable_node);
if (!tree_page)
return NULL;
@@ -1294,12 +1075,13 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
if (!stable_node)
return NULL;
+ rb_link_node(&stable_node->node, parent, new);
+ rb_insert_color(&stable_node->node, &root_stable_tree);
+
INIT_HLIST_HEAD(&stable_node->hlist);
- stable_node->kpfn = kpfn;
+
+ stable_node->kpfn = page_to_pfn(kpage);
set_page_stable_node(kpage, stable_node);
- DO_NUMA(stable_node->nid = nid);
- rb_link_node(&stable_node->node, parent, new);
- rb_insert_color(&stable_node->node, root);
return stable_node;
}
@@ -1322,15 +1104,10 @@ static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
+
{
- struct rb_node **new;
- struct rb_root *root;
+ struct rb_node **new = &root_unstable_tree.rb_node;
struct rb_node *parent = NULL;
- int nid;
-
- nid = get_kpfn_nid(page_to_pfn(page));
- root = root_unstable_tree + nid;
- new = &root->rb_node;
while (*new) {
struct rmap_item *tree_rmap_item;
@@ -1360,15 +1137,6 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
} else if (ret > 0) {
put_page(tree_page);
new = &parent->rb_right;
- } else if (!ksm_merge_across_nodes &&
- page_to_nid(tree_page) != nid) {
- /*
- * If tree_page has been migrated to another NUMA node,
- * it will be flushed out and put in the right unstable
- * tree next time: only merge with it when across_nodes.
- */
- put_page(tree_page);
- return NULL;
} else {
*tree_pagep = tree_page;
return tree_rmap_item;
@@ -1377,9 +1145,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
rmap_item->address |= UNSTABLE_FLAG;
rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
- DO_NUMA(rmap_item->nid = nid);
rb_link_node(&rmap_item->node, parent, new);
- rb_insert_color(&rmap_item->node, root);
+ rb_insert_color(&rmap_item->node, &root_unstable_tree);
ksm_pages_unshared++;
return NULL;
@@ -1421,29 +1188,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
unsigned int checksum;
int err;
- stable_node = page_stable_node(page);
- if (stable_node) {
- if (stable_node->head != &migrate_nodes &&
- get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
- rb_erase(&stable_node->node,
- root_stable_tree + NUMA(stable_node->nid));
- stable_node->head = &migrate_nodes;
- list_add(&stable_node->list, stable_node->head);
- }
- if (stable_node->head != &migrate_nodes &&
- rmap_item->head == stable_node)
- return;
- }
+ remove_rmap_item_from_tree(rmap_item);
/* We first start with searching the page inside the stable tree */
kpage = stable_tree_search(page);
- if (kpage == page && rmap_item->head == stable_node) {
- put_page(kpage);
- return;
- }
-
- remove_rmap_item_from_tree(rmap_item);
-
if (kpage) {
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
@@ -1477,11 +1225,14 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
+ /*
+ * As soon as we merge this page, we want to remove the
+ * rmap_item of the page we have merged with from the unstable
+ * tree, and insert it instead as new node in the stable tree.
+ */
if (kpage) {
- /*
- * The pages were successfully merged: insert new
- * node in the stable tree and add both rmap_items.
- */
+ remove_rmap_item_from_tree(tree_rmap_item);
+
lock_page(kpage);
stable_node = stable_tree_insert(kpage);
if (stable_node) {
@@ -1538,7 +1289,6 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
- int nid;
if (list_empty(&ksm_mm_head.mm_list))
return NULL;
@@ -1557,29 +1307,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
*/
lru_add_drain_all();
- /*
- * Whereas stale stable_nodes on the stable_tree itself
- * get pruned in the regular course of stable_tree_search(),
- * those moved out to the migrate_nodes list can accumulate:
- * so prune them once before each full scan.
- */
- if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node;
- struct list_head *this, *next;
- struct page *page;
-
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this,
- struct stable_node, list);
- page = get_ksm_page(stable_node, false);
- if (page)
- put_page(page);
- cond_resched();
- }
- }
-
- for (nid = 0; nid < ksm_nr_node_ids; nid++)
- root_unstable_tree[nid] = RB_ROOT;
+ root_unstable_tree = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1664,7 +1392,7 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
- hash_del(&slot->link);
+ hlist_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
@@ -1700,7 +1428,8 @@ static void ksm_do_scan(unsigned int scan_npages)
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
- cmp_and_merge_page(page, rmap_item);
+ if (!PageKsm(page) || !in_stable_tree(rmap_item))
+ cmp_and_merge_page(page, rmap_item);
put_page(page);
}
}
@@ -1717,7 +1446,6 @@ static int ksm_scan_thread(void *nothing)
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
- wait_while_offlining();
if (ksmd_should_run())
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
@@ -1797,19 +1525,11 @@ int __ksm_enter(struct mm_struct *mm)
spin_lock(&ksm_mmlist_lock);
insert_to_mm_slots_hash(mm, mm_slot);
/*
- * When KSM_RUN_MERGE (or KSM_RUN_STOP),
- * insert just behind the scanning cursor, to let the area settle
+ * Insert just behind the scanning cursor, to let the area settle
* down a little; when fork is followed by immediate exec, we don't
* want ksmd to waste time setting up and tearing down an rmap_list.
- *
- * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
- * scanning cursor, otherwise KSM pages in newly forked mms will be
- * missed: then we might as well insert at the end of the list.
*/
- if (ksm_run & KSM_RUN_UNMERGE)
- list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
- else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1839,7 +1559,7 @@ void __ksm_exit(struct mm_struct *mm)
mm_slot = get_mm_slot(mm);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
- hash_del(&mm_slot->link);
+ hlist_del(&mm_slot->link);
list_del(&mm_slot->mm_list);
easy_to_free = 1;
} else {
@@ -1859,57 +1579,47 @@ void __ksm_exit(struct mm_struct *mm)
}
}
-struct page *ksm_might_need_to_copy(struct page *page,
+struct page *ksm_does_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- struct anon_vma *anon_vma = page_anon_vma(page);
struct page *new_page;
- if (PageKsm(page)) {
- if (page_stable_node(page) &&
- !(ksm_run & KSM_RUN_UNMERGE))
- return page; /* no need to copy it */
- } else if (!anon_vma) {
- return page; /* no need to copy it */
- } else if (anon_vma->root == vma->anon_vma->root &&
- page->index == linear_page_index(vma, address)) {
- return page; /* still no need to copy it */
- }
- if (!PageUptodate(page))
- return page; /* let do_swap_page report the error */
-
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
SetPageDirty(new_page);
__SetPageUptodate(new_page);
+ SetPageSwapBacked(new_page);
__set_page_locked(new_page);
+
+ if (!mlocked_vma_newpage(vma, new_page))
+ lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
+ else
+ add_page_to_unevictable_list(new_page);
}
return new_page;
}
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
+ struct hlist_node *hlist;
+ unsigned int mapcount = page_mapcount(page);
+ int referenced = 0;
int search_new_forks = 0;
- VM_BUG_ON_PAGE(!PageKsm(page), page);
-
- /*
- * Rely on the page lock to protect against concurrent modifications
- * to that page's node of the stable tree.
- */
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
stable_node = page_stable_node(page);
if (!stable_node)
- return ret;
+ return 0;
again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
@@ -1930,16 +1640,115 @@ again:
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
- if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
continue;
- ret = rwc->rmap_one(page, vma,
- rmap_item->address, rwc->arg);
- if (ret != SWAP_AGAIN) {
+ referenced += page_referenced_one(page, vma,
+ rmap_item->address, &mapcount, vm_flags);
+ if (!search_new_forks || !mapcount)
+ break;
+ }
+ anon_vma_unlock_read(anon_vma);
+ if (!mapcount)
+ goto out;
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return referenced;
+}
+
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return SWAP_FAIL;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
+ struct vm_area_struct *vma;
+
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
+ vma = vmac->vma;
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ ret = try_to_unmap_one(page, vma,
+ rmap_item->address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page)) {
anon_vma_unlock_read(anon_vma);
goto out;
}
- if (rwc->done && rwc->done(page)) {
+ }
+ anon_vma_unlock_read(anon_vma);
+ }
+ if (!search_new_forks++)
+ goto again;
+out:
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+ struct stable_node *stable_node;
+ struct hlist_node *hlist;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON(!PageKsm(page));
+ VM_BUG_ON(!PageLocked(page));
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return ret;
+again:
+ hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
+ struct vm_area_struct *vma;
+
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
+ vma = vmac->vma;
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue;
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ ret = rmap_one(page, vma, rmap_item->address, arg);
+ if (ret != SWAP_AGAIN) {
anon_vma_unlock_read(anon_vma);
goto out;
}
@@ -1952,128 +1761,76 @@ out:
return ret;
}
-#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
- VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+ VM_BUG_ON(!PageLocked(oldpage));
+ VM_BUG_ON(!PageLocked(newpage));
+ VM_BUG_ON(newpage->mapping != oldpage->mapping);
stable_node = page_stable_node(newpage);
if (stable_node) {
- VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
stable_node->kpfn = page_to_pfn(newpage);
- /*
- * newpage->mapping was set in advance; now we need smp_wmb()
- * to make sure that the new stable_node->kpfn is visible
- * to get_ksm_page() before it can see that oldpage->mapping
- * has gone stale (or that PageSwapCache has been cleared).
- */
- smp_wmb();
- set_page_stable_node(oldpage, NULL);
}
}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_MEMORY_HOTREMOVE
-static int just_wait(void *word)
+static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
{
- schedule();
- return 0;
-}
+ struct rb_node *node;
-static void wait_while_offlining(void)
-{
- while (ksm_run & KSM_RUN_OFFLINE) {
- mutex_unlock(&ksm_thread_mutex);
- wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
- just_wait, TASK_UNINTERRUPTIBLE);
- mutex_lock(&ksm_thread_mutex);
- }
-}
+ for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
+ struct stable_node *stable_node;
-static void ksm_check_stable_tree(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct stable_node *stable_node;
- struct list_head *this, *next;
- struct rb_node *node;
- int nid;
-
- for (nid = 0; nid < ksm_nr_node_ids; nid++) {
- node = rb_first(root_stable_tree + nid);
- while (node) {
- stable_node = rb_entry(node, struct stable_node, node);
- if (stable_node->kpfn >= start_pfn &&
- stable_node->kpfn < end_pfn) {
- /*
- * Don't get_ksm_page, page has already gone:
- * which is why we keep kpfn instead of page*
- */
- remove_node_from_stable_tree(stable_node);
- node = rb_first(root_stable_tree + nid);
- } else
- node = rb_next(node);
- cond_resched();
- }
- }
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
+ stable_node = rb_entry(node, struct stable_node, node);
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
- remove_node_from_stable_tree(stable_node);
- cond_resched();
+ return stable_node;
}
+ return NULL;
}
static int ksm_memory_callback(struct notifier_block *self,
unsigned long action, void *arg)
{
struct memory_notify *mn = arg;
+ struct stable_node *stable_node;
switch (action) {
case MEM_GOING_OFFLINE:
/*
- * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
- * and remove_all_stable_nodes() while memory is going offline:
- * it is unsafe for them to touch the stable tree at this time.
- * But unmerge_ksm_pages(), rmap lookups and other entry points
- * which do not need the ksm_thread_mutex are all safe.
+ * Keep it very simple for now: just lock out ksmd and
+ * MADV_UNMERGEABLE while any memory is going offline.
+ * mutex_lock_nested() is necessary because lockdep was alarmed
+ * that here we take ksm_thread_mutex inside notifier chain
+ * mutex, and later take notifier chain mutex inside
+ * ksm_thread_mutex to unlock it. But that's safe because both
+ * are inside mem_hotplug_mutex.
*/
- mutex_lock(&ksm_thread_mutex);
- ksm_run |= KSM_RUN_OFFLINE;
- mutex_unlock(&ksm_thread_mutex);
+ mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
break;
case MEM_OFFLINE:
/*
* Most of the work is done by page migration; but there might
* be a few stable_nodes left over, still pointing to struct
- * pages which have been offlined: prune those from the tree,
- * otherwise get_ksm_page() might later try to access a
- * non-existent struct page.
+ * pages which have been offlined: prune those from the tree.
*/
- ksm_check_stable_tree(mn->start_pfn,
- mn->start_pfn + mn->nr_pages);
+ while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages)) != NULL)
+ remove_node_from_stable_tree(stable_node);
/* fallthrough */
case MEM_CANCEL_OFFLINE:
- mutex_lock(&ksm_thread_mutex);
- ksm_run &= ~KSM_RUN_OFFLINE;
mutex_unlock(&ksm_thread_mutex);
-
- smp_mb(); /* wake_up_bit advises this */
- wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
break;
}
return NOTIFY_OK;
}
-#else
-static void wait_while_offlining(void)
-{
-}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_SYSFS
@@ -2100,7 +1857,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
unsigned long msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
+ err = strict_strtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
@@ -2123,7 +1880,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
int err;
unsigned long nr_pages;
- err = kstrtoul(buf, 10, &nr_pages);
+ err = strict_strtoul(buf, 10, &nr_pages);
if (err || nr_pages > UINT_MAX)
return -EINVAL;
@@ -2136,7 +1893,7 @@ KSM_ATTR(pages_to_scan);
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", ksm_run);
+ return sprintf(buf, "%u\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -2145,7 +1902,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
int err;
unsigned long flags;
- err = kstrtoul(buf, 10, &flags);
+ err = strict_strtoul(buf, 10, &flags);
if (err || flags > UINT_MAX)
return -EINVAL;
if (flags > KSM_RUN_UNMERGE)
@@ -2159,7 +1916,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
*/
mutex_lock(&ksm_thread_mutex);
- wait_while_offlining();
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
@@ -2181,64 +1937,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
}
KSM_ATTR(run);
-#ifdef CONFIG_NUMA
-static ssize_t merge_across_nodes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sprintf(buf, "%u\n", ksm_merge_across_nodes);
-}
-
-static ssize_t merge_across_nodes_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- int err;
- unsigned long knob;
-
- err = kstrtoul(buf, 10, &knob);
- if (err)
- return err;
- if (knob > 1)
- return -EINVAL;
-
- mutex_lock(&ksm_thread_mutex);
- wait_while_offlining();
- if (ksm_merge_across_nodes != knob) {
- if (ksm_pages_shared || remove_all_stable_nodes())
- err = -EBUSY;
- else if (root_stable_tree == one_stable_tree) {
- struct rb_root *buf;
- /*
- * This is the first time that we switch away from the
- * default of merging across nodes: must now allocate
- * a buffer to hold as many roots as may be needed.
- * Allocate stable and unstable together:
- * MAXSMP NODES_SHIFT 10 will use 16kB.
- */
- buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
- GFP_KERNEL);
- /* Let us assume that RB_ROOT is NULL is zero */
- if (!buf)
- err = -ENOMEM;
- else {
- root_stable_tree = buf;
- root_unstable_tree = buf + nr_node_ids;
- /* Stable tree is empty but not the unstable */
- root_unstable_tree[0] = one_unstable_tree[0];
- }
- }
- if (!err) {
- ksm_merge_across_nodes = knob;
- ksm_nr_node_ids = knob ? 1 : nr_node_ids;
- }
- }
- mutex_unlock(&ksm_thread_mutex);
-
- return err ? err : count;
-}
-KSM_ATTR(merge_across_nodes);
-#endif
-
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -2293,9 +1991,6 @@ static struct attribute *ksm_attrs[] = {
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
&full_scans_attr.attr,
-#ifdef CONFIG_NUMA
- &merge_across_nodes_attr.attr,
-#endif
NULL,
};
@@ -2334,7 +2029,10 @@ static int __init ksm_init(void)
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_MEMORY_HOTREMOVE
- /* There is no significance to this priority 100 */
+ /*
+ * Choose a high priority since the callback takes ksm_thread_mutex:
+ * later callbacks could only be taking locks which nest within that.
+ */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
return 0;
@@ -2344,4 +2042,4 @@ out_free:
out:
return err;
}
-subsys_initcall(ksm_init);
+module_init(ksm_init)
diff --git a/mm/list_lru.c b/mm/list_lru.c
deleted file mode 100644
index f1a0db19417..00000000000
--- a/mm/list_lru.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
- * Authors: David Chinner and Glauber Costa
- *
- * Generic LRU infrastructure
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/list_lru.h>
-#include <linux/slab.h>
-
-bool list_lru_add(struct list_lru *lru, struct list_head *item)
-{
- int nid = page_to_nid(virt_to_page(item));
- struct list_lru_node *nlru = &lru->node[nid];
-
- spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- if (list_empty(item)) {
- list_add_tail(item, &nlru->list);
- if (nlru->nr_items++ == 0)
- node_set(nid, lru->active_nodes);
- spin_unlock(&nlru->lock);
- return true;
- }
- spin_unlock(&nlru->lock);
- return false;
-}
-EXPORT_SYMBOL_GPL(list_lru_add);
-
-bool list_lru_del(struct list_lru *lru, struct list_head *item)
-{
- int nid = page_to_nid(virt_to_page(item));
- struct list_lru_node *nlru = &lru->node[nid];
-
- spin_lock(&nlru->lock);
- if (!list_empty(item)) {
- list_del_init(item);
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
- spin_unlock(&nlru->lock);
- return true;
- }
- spin_unlock(&nlru->lock);
- return false;
-}
-EXPORT_SYMBOL_GPL(list_lru_del);
-
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
-{
- unsigned long count = 0;
- struct list_lru_node *nlru = &lru->node[nid];
-
- spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- count += nlru->nr_items;
- spin_unlock(&nlru->lock);
-
- return count;
-}
-EXPORT_SYMBOL_GPL(list_lru_count_node);
-
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long *nr_to_walk)
-{
-
- struct list_lru_node *nlru = &lru->node[nid];
- struct list_head *item, *n;
- unsigned long isolated = 0;
-
- spin_lock(&nlru->lock);
-restart:
- list_for_each_safe(item, n, &nlru->list) {
- enum lru_status ret;
-
- /*
- * decrement nr_to_walk first so that we don't livelock if we
- * get stuck on large numbesr of LRU_RETRY items
- */
- if (!*nr_to_walk)
- break;
- --*nr_to_walk;
-
- ret = isolate(item, &nlru->lock, cb_arg);
- switch (ret) {
- case LRU_REMOVED_RETRY:
- assert_spin_locked(&nlru->lock);
- case LRU_REMOVED:
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
- isolated++;
- /*
- * If the lru lock has been dropped, our list
- * traversal is now invalid and so we have to
- * restart from scratch.
- */
- if (ret == LRU_REMOVED_RETRY)
- goto restart;
- break;
- case LRU_ROTATE:
- list_move_tail(item, &nlru->list);
- break;
- case LRU_SKIP:
- break;
- case LRU_RETRY:
- /*
- * The lru lock has been dropped, our list traversal is
- * now invalid and so we have to restart from scratch.
- */
- assert_spin_locked(&nlru->lock);
- goto restart;
- default:
- BUG();
- }
- }
-
- spin_unlock(&nlru->lock);
- return isolated;
-}
-EXPORT_SYMBOL_GPL(list_lru_walk_node);
-
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
-{
- int i;
- size_t size = sizeof(*lru->node) * nr_node_ids;
-
- lru->node = kzalloc(size, GFP_KERNEL);
- if (!lru->node)
- return -ENOMEM;
-
- nodes_clear(lru->active_nodes);
- for (i = 0; i < nr_node_ids; i++) {
- spin_lock_init(&lru->node[i].lock);
- if (key)
- lockdep_set_class(&lru->node[i].lock, key);
- INIT_LIST_HEAD(&lru->node[i].list);
- lru->node[i].nr_items = 0;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(list_lru_init_key);
-
-void list_lru_destroy(struct list_lru *lru)
-{
- kfree(lru->node);
-}
-EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/madvise.c b/mm/madvise.c
index a402f8fdc68..03dfa5c7adb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,9 +16,6 @@
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/blkdev.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -42,11 +39,11 @@ static int madvise_need_mmap_write(int behavior)
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
-static long madvise_behavior(struct vm_area_struct *vma,
+static long madvise_behavior(struct vm_area_struct * vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
@@ -134,105 +131,15 @@ out:
return error;
}
-#ifdef CONFIG_SWAP
-static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- pte_t *orig_pte;
- struct vm_area_struct *vma = walk->private;
- unsigned long index;
-
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- return 0;
-
- for (index = start; index != end; index += PAGE_SIZE) {
- pte_t pte;
- swp_entry_t entry;
- struct page *page;
- spinlock_t *ptl;
-
- orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- pte = *(orig_pte + ((index - start) / PAGE_SIZE));
- pte_unmap_unlock(orig_pte, ptl);
-
- if (pte_present(pte) || pte_none(pte) || pte_file(pte))
- continue;
- entry = pte_to_swp_entry(pte);
- if (unlikely(non_swap_entry(entry)))
- continue;
-
- page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index);
- if (page)
- page_cache_release(page);
- }
-
- return 0;
-}
-
-static void force_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- struct mm_walk walk = {
- .mm = vma->vm_mm,
- .pmd_entry = swapin_walk_pmd_entry,
- .private = vma,
- };
-
- walk_page_range(start, end, &walk);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
-
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct address_space *mapping)
-{
- pgoff_t index;
- struct page *page;
- swp_entry_t swap;
-
- for (; start < end; start += PAGE_SIZE) {
- index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
- page = find_get_entry(mapping, index);
- if (!radix_tree_exceptional_entry(page)) {
- if (page)
- page_cache_release(page);
- continue;
- }
- swap = radix_to_swp_entry(page);
- page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0);
- if (page)
- page_cache_release(page);
- }
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
-#endif /* CONFIG_SWAP */
-
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
+static long madvise_willneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
-#ifdef CONFIG_SWAP
- if (!file || mapping_cap_swap_backed(file->f_mapping)) {
- *prev = vma;
- if (!file)
- force_swapin_readahead(vma, start, end);
- else
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
- return 0;
- }
-#endif
-
if (!file)
return -EBADF;
@@ -270,8 +177,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
+static long madvise_dontneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
*prev = vma;
@@ -343,35 +250,29 @@ static long madvise_remove(struct vm_area_struct *vma,
*/
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
- struct page *p;
+ int ret = 0;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- for (; start < end; start += PAGE_SIZE <<
- compound_order(compound_head(p))) {
- int ret;
-
- ret = get_user_pages_fast(start, 1, 0, &p);
+ for (; start < end; start += PAGE_SIZE) {
+ struct page *p;
+ int ret = get_user_pages_fast(start, 1, 0, &p);
if (ret != 1)
return ret;
-
- if (PageHWPoison(p)) {
- put_page(p);
- continue;
- }
if (bhv == MADV_SOFT_OFFLINE) {
- pr_info("Soft offlining page %#lx at %#lx\n",
+ printk(KERN_INFO "Soft offlining page %lx at %lx\n",
page_to_pfn(p), start);
ret = soft_offline_page(p, MF_COUNT_INCREASED);
if (ret)
- return ret;
+ break;
continue;
}
- pr_info("Injecting memory failure for page %#lx at %#lx\n",
+ printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
page_to_pfn(p), start);
/* Ignore return value for now */
memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
}
- return 0;
+ return ret;
}
#endif
@@ -465,12 +366,11 @@ madvise_behavior_valid(int behavior)
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
unsigned long end, tmp;
- struct vm_area_struct *vma, *prev;
+ struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
int write;
size_t len;
- struct blk_plug plug;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -479,27 +379,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (!madvise_behavior_valid(behavior))
return error;
+ write = madvise_need_mmap_write(behavior);
+ if (write)
+ down_write(&current->mm->mmap_sem);
+ else
+ down_read(&current->mm->mmap_sem);
+
if (start & ~PAGE_MASK)
- return error;
+ goto out;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return error;
+ goto out;
end = start + len;
if (end < start)
- return error;
+ goto out;
error = 0;
if (end == start)
- return error;
-
- write = madvise_need_mmap_write(behavior);
- if (write)
- down_write(&current->mm->mmap_sem);
- else
- down_read(&current->mm->mmap_sem);
+ goto out;
/*
* If the interval [start,end) covers some unmapped address
@@ -510,7 +410,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (vma && start > vma->vm_start)
prev = vma;
- blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
@@ -546,7 +445,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
vma = find_vma(current->mm, start);
}
out:
- blk_finish_plug(&plug);
if (write)
up_write(&current->mm->mmap_sem);
else
diff --git a/mm/memblock.c b/mm/memblock.c
index 6d2f219a48b..88adc8afb61 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,16 +20,8 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
-#include <asm-generic/sections.h>
-#include <linux/io.h>
-
-#include "internal.h"
-
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
-#endif
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
@@ -40,20 +32,10 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- .physmem.regions = memblock_physmem_init_regions,
- .physmem.cnt = 1, /* empty dummy entry */
- .physmem.max = INIT_PHYSMEM_REGIONS,
-#endif
-
- .bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
int memblock_debug __initdata_memblock;
-#ifdef CONFIG_MOVABLE_NODE
-bool movable_node_enabled __initdata_memblock = false;
-#endif
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -100,57 +82,33 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
return (i < type->cnt) ? i : -1;
}
-/*
- * __memblock_find_range_bottom_up - find free area utility in bottom-up
+/**
+ * memblock_find_in_range_node - find free area in given range and node
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
*
- * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ * Find @size free area aligned to @align in the specified range and node.
*
* RETURNS:
- * Found address on success, 0 on failure.
+ * Found address on success, %0 on failure.
*/
-static phys_addr_t __init_memblock
-__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+ phys_addr_t end, phys_addr_t size,
+ phys_addr_t align, int nid)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
- this_start = clamp(this_start, start, end);
- this_end = clamp(this_end, start, end);
-
- cand = round_up(this_start, align);
- if (cand < this_end && this_end - cand >= size)
- return cand;
- }
-
- return 0;
-}
+ /* pump up @end */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
-/**
- * __memblock_find_range_top_down - find free area utility, in top-down
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
- * @size: size of free area to find
- * @align: alignment of free area to find
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Utility called from memblock_find_in_range_node(), find free area top-down.
- *
- * RETURNS:
- * Found address on success, 0 on failure.
- */
-static phys_addr_t __init_memblock
-__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
-{
- phys_addr_t this_start, this_end, cand;
- u64 i;
+ /* avoid allocating the first page */
+ start = max_t(phys_addr_t, start, PAGE_SIZE);
+ end = max(start, end);
for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
@@ -163,81 +121,10 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
if (cand >= this_start)
return cand;
}
-
return 0;
}
/**
- * memblock_find_in_range_node - find free area in given range and node
- * @size: size of free area to find
- * @align: alignment of free area to find
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Find @size free area aligned to @align in the specified range and node.
- *
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
- * RETURNS:
- * Found address on success, 0 on failure.
- */
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
- phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
-{
- int ret;
- phys_addr_t kernel_end;
-
- /* pump up @end */
- if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
- end = memblock.current_limit;
-
- /* avoid allocating the first page */
- start = max_t(phys_addr_t, start, PAGE_SIZE);
- end = max(start, end);
- kernel_end = __pa_symbol(_end);
-
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
-
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
- if (ret)
- return ret;
-
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
- }
-
- return __memblock_find_range_top_down(start, end, size, align, nid);
-}
-
-/**
* memblock_find_in_range - find free area in given range
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
@@ -247,14 +134,14 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
- * Found address on success, 0 on failure.
+ * Found address on success, %0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(size, align, start, end,
- NUMA_NO_NODE);
+ return memblock_find_in_range_node(start, end, size, align,
+ MAX_NUMNODES);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -270,13 +157,10 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
- type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
phys_addr_t *addr)
{
@@ -289,20 +173,6 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
memblock.reserved.max);
}
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
- phys_addr_t *addr)
-{
- if (memblock.memory.regions == memblock_memory_init_regions)
- return 0;
-
- *addr = __pa(memblock.memory.regions);
-
- return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
-}
-
-#endif
-
/**
* memblock_double_array - double the size of the memblock regions array
* @type: memblock type of the regions array being doubled
@@ -437,8 +307,7 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
- memblock_get_region_node(next) ||
- this->flags != next->flags) {
+ memblock_get_region_node(next)) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -453,20 +322,17 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
/**
* memblock_insert_region - insert new memblock region
- * @type: memblock type to insert into
- * @idx: index for the insertion point
- * @base: base address of the new region
- * @size: size of the new region
- * @nid: node id of the new region
- * @flags: flags of the new region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
* @type must already have extra room to accomodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
- phys_addr_t size,
- int nid, unsigned long flags)
+ phys_addr_t size, int nid)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -474,19 +340,17 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
- rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
}
/**
- * memblock_add_range - add new memblock region
+ * memblock_add_region - add new memblock region
* @type: memblock type to add new region into
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
- * @flags: flags of the new region
*
* Add new memblock region [@base,@base+@size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
@@ -496,9 +360,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init_memblock memblock_add_range(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size,
- int nid, unsigned long flags)
+static int __init_memblock memblock_add_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size, int nid)
{
bool insert = false;
phys_addr_t obase = base;
@@ -513,7 +376,6 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
- type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
@@ -544,8 +406,7 @@ repeat:
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
- rbase - base, nid,
- flags);
+ rbase - base, nid);
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -555,8 +416,7 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base,
- nid, flags);
+ memblock_insert_region(type, i, base, end - base, nid);
}
/*
@@ -578,13 +438,12 @@ repeat:
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
- return memblock_add_range(&memblock.memory, base, size, nid, 0);
+ return memblock_add_region(&memblock.memory, base, size, nid);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_range(&memblock.memory, base, size,
- MAX_NUMNODES, 0);
+ return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
}
/**
@@ -639,8 +498,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= base - rbase;
type->total_size -= base - rbase;
memblock_insert_region(type, i, rbase, base - rbase,
- memblock_get_region_node(rgn),
- rgn->flags);
+ memblock_get_region_node(rgn));
} else if (rend > end) {
/*
* @rgn intersects from above. Split and redo the
@@ -650,8 +508,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= end - rbase;
type->total_size -= end - rbase;
memblock_insert_region(type, i--, rbase, end - rbase,
- memblock_get_region_node(rgn),
- rgn->flags);
+ memblock_get_region_node(rgn));
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
@@ -663,8 +520,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
return 0;
}
-int __init_memblock memblock_remove_range(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size)
+static int __init_memblock __memblock_remove(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
int start_rgn, end_rgn;
int i, ret;
@@ -680,108 +537,43 @@ int __init_memblock memblock_remove_range(struct memblock_type *type,
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
- return memblock_remove_range(&memblock.memory, base, size);
+ return __memblock_remove(&memblock.memory, base, size);
}
-
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size - 1,
+ (unsigned long long)base + size,
(void *)_RET_IP_);
- kmemleak_free_part(__va(base), size);
- return memblock_remove_range(&memblock.reserved, base, size);
+ return __memblock_remove(&memblock.reserved, base, size);
}
-static int __init_memblock memblock_reserve_region(phys_addr_t base,
- phys_addr_t size,
- int nid,
- unsigned long flags)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
struct memblock_type *_rgn = &memblock.reserved;
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
+ memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size - 1,
- flags, (void *)_RET_IP_);
-
- return memblock_add_range(_rgn, base, size, nid, flags);
-}
-
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
-{
- return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
-}
-
-/**
- * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
- * @base: the base phys addr of the region
- * @size: the size of the region
- *
- * This function isolates region [@base, @base + @size), and mark it with flag
- * MEMBLOCK_HOTPLUG.
- *
- * Return 0 on succees, -errno on failure.
- */
-int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
-{
- struct memblock_type *type = &memblock.memory;
- int i, ret, start_rgn, end_rgn;
-
- ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
- if (ret)
- return ret;
-
- for (i = start_rgn; i < end_rgn; i++)
- memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+ (unsigned long long)base + size,
+ (void *)_RET_IP_);
- memblock_merge_regions(type);
- return 0;
+ return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
}
/**
- * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
- * @base: the base phys addr of the region
- * @size: the size of the region
- *
- * This function isolates region [@base, @base + @size), and clear flag
- * MEMBLOCK_HOTPLUG for the isolated regions.
- *
- * Return 0 on succees, -errno on failure.
- */
-int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
-{
- struct memblock_type *type = &memblock.memory;
- int i, ret, start_rgn, end_rgn;
-
- ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
- if (ret)
- return ret;
-
- for (i = start_rgn; i < end_rgn; i++)
- memblock_clear_region_flags(&type->regions[i],
- MEMBLOCK_HOTPLUG);
-
- memblock_merge_regions(type);
- return 0;
-}
-
-/**
- * __next__mem_range - next function for for_each_free_mem_range() etc.
+ * __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: node selector, %NUMA_NO_NODE for all nodes
- * @type_a: pointer to memblock_type from where the range is taken
- * @type_b: pointer to memblock_type which excludes memory from being taken
+ * @nid: nid: node selector, %MAX_NUMNODES for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
- * Find the first area from *@idx which matches @nid, fill the out
+ * Find the first free area from *@idx which matches @nid, fill the out
* parameters, and update *@idx for the next iteration. The lower 32bit of
- * *@idx contains index into type_a and the upper 32bit indexes the
- * areas before each region in type_b. For example, if type_b regions
+ * *@idx contains index into memory region and the upper 32bit indexes the
+ * areas before each reserved region. For example, if reserved regions
* look like the following,
*
* 0:[0-16), 1:[32-48), 2:[128-130)
@@ -793,77 +585,50 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
- struct memblock_type *type_a,
- struct memblock_type *type_b,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __init_memblock __next_free_mem_range(u64 *idx, int nid,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- int idx_a = *idx & 0xffffffff;
- int idx_b = *idx >> 32;
-
- if (WARN_ONCE(nid == MAX_NUMNODES,
- "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
- nid = NUMA_NO_NODE;
-
- for (; idx_a < type_a->cnt; idx_a++) {
- struct memblock_region *m = &type_a->regions[idx_a];
+ struct memblock_type *mem = &memblock.memory;
+ struct memblock_type *rsv = &memblock.reserved;
+ int mi = *idx & 0xffffffff;
+ int ri = *idx >> 32;
+ for ( ; mi < mem->cnt; mi++) {
+ struct memblock_region *m = &mem->regions[mi];
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
- int m_nid = memblock_get_region_node(m);
/* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
+ if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
continue;
- if (!type_b) {
- if (out_start)
- *out_start = m_start;
- if (out_end)
- *out_end = m_end;
- if (out_nid)
- *out_nid = m_nid;
- idx_a++;
- *idx = (u32)idx_a | (u64)idx_b << 32;
- return;
- }
-
- /* scan areas before each reservation */
- for (; idx_b < type_b->cnt + 1; idx_b++) {
- struct memblock_region *r;
- phys_addr_t r_start;
- phys_addr_t r_end;
-
- r = &type_b->regions[idx_b];
- r_start = idx_b ? r[-1].base + r[-1].size : 0;
- r_end = idx_b < type_b->cnt ?
- r->base : ULLONG_MAX;
+ /* scan areas before each reservation for intersection */
+ for ( ; ri < rsv->cnt + 1; ri++) {
+ struct memblock_region *r = &rsv->regions[ri];
+ phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+ phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
- /*
- * if idx_b advanced past idx_a,
- * break out to advance idx_a
- */
+ /* if ri advanced past mi, break out to advance mi */
if (r_start >= m_end)
break;
/* if the two regions intersect, we're done */
if (m_start < r_end) {
if (out_start)
- *out_start =
- max(m_start, r_start);
+ *out_start = max(m_start, r_start);
if (out_end)
*out_end = min(m_end, r_end);
if (out_nid)
- *out_nid = m_nid;
+ *out_nid = memblock_get_region_node(m);
/*
- * The region which ends first is
- * advanced for the next iteration.
+ * The region which ends first is advanced
+ * for the next iteration.
*/
if (m_end <= r_end)
- idx_a++;
+ mi++;
else
- idx_b++;
- *idx = (u32)idx_a | (u64)idx_b << 32;
+ ri++;
+ *idx = (u32)mi | (u64)ri << 32;
return;
}
}
@@ -874,80 +639,45 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
}
/**
- * __next_mem_range_rev - generic next function for for_each_*_range_rev()
- *
- * Finds the next range from type_a which is not marked as unsuitable
- * in type_b.
- *
+ * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
- * @type_a: pointer to memblock_type from where the range is taken
- * @type_b: pointer to memblock_type which excludes memory from being taken
+ * @nid: nid: node selector, %MAX_NUMNODES for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
- * Reverse of __next_mem_range().
+ * Reverse of __next_free_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
- struct memblock_type *type_a,
- struct memblock_type *type_b,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- int idx_a = *idx & 0xffffffff;
- int idx_b = *idx >> 32;
-
- if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
- nid = NUMA_NO_NODE;
+ struct memblock_type *mem = &memblock.memory;
+ struct memblock_type *rsv = &memblock.reserved;
+ int mi = *idx & 0xffffffff;
+ int ri = *idx >> 32;
if (*idx == (u64)ULLONG_MAX) {
- idx_a = type_a->cnt - 1;
- idx_b = type_b->cnt;
+ mi = mem->cnt - 1;
+ ri = rsv->cnt;
}
- for (; idx_a >= 0; idx_a--) {
- struct memblock_region *m = &type_a->regions[idx_a];
-
+ for ( ; mi >= 0; mi--) {
+ struct memblock_region *m = &mem->regions[mi];
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
- int m_nid = memblock_get_region_node(m);
/* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
- continue;
-
- /* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
continue;
- if (!type_b) {
- if (out_start)
- *out_start = m_start;
- if (out_end)
- *out_end = m_end;
- if (out_nid)
- *out_nid = m_nid;
- idx_a++;
- *idx = (u32)idx_a | (u64)idx_b << 32;
- return;
- }
-
- /* scan areas before each reservation */
- for (; idx_b >= 0; idx_b--) {
- struct memblock_region *r;
- phys_addr_t r_start;
- phys_addr_t r_end;
-
- r = &type_b->regions[idx_b];
- r_start = idx_b ? r[-1].base + r[-1].size : 0;
- r_end = idx_b < type_b->cnt ?
- r->base : ULLONG_MAX;
- /*
- * if idx_b advanced past idx_a,
- * break out to advance idx_a
- */
+ /* scan areas before each reservation for intersection */
+ for ( ; ri >= 0; ri--) {
+ struct memblock_region *r = &rsv->regions[ri];
+ phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+ phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+ /* if ri advanced past mi, break out to advance mi */
if (r_end <= m_start)
break;
/* if the two regions intersect, we're done */
@@ -957,17 +687,18 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
if (out_end)
*out_end = min(m_end, r_end);
if (out_nid)
- *out_nid = m_nid;
+ *out_nid = memblock_get_region_node(m);
+
if (m_start >= r_start)
- idx_a--;
+ mi--;
else
- idx_b--;
- *idx = (u32)idx_a | (u64)idx_b << 32;
+ ri--;
+ *idx = (u32)mi | (u64)ri << 32;
return;
}
}
}
- /* signal end of iteration */
+
*idx = ULLONG_MAX;
}
@@ -1007,18 +738,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
* @size: size of area to set node ID for
- * @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
- struct memblock_type *type, int nid)
+ int nid)
{
+ struct memblock_type *type = &memblock.memory;
int start_rgn, end_rgn;
int i, ret;
@@ -1034,38 +765,20 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t max_addr,
+ int nid)
{
phys_addr_t found;
- if (!align)
- align = SMP_CACHE_BYTES;
+ /* align @size to avoid excessive fragmentation on reserved array */
+ size = round_up(size, align);
- found = memblock_find_in_range_node(size, align, start, end, nid);
- if (found && !memblock_reserve(found, size)) {
- /*
- * The min_count is set to 0 so that memblock allocations are
- * never reported as leaks.
- */
- kmemleak_alloc(__va(found), size, 0, 0);
+ found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+ if (found && !memblock_reserve(found, size))
return found;
- }
- return 0;
-}
-
-phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end)
-{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
-}
-static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t max_addr,
- int nid)
-{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ return 0;
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
@@ -1075,7 +788,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+ return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1105,207 +818,6 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
-/**
- * memblock_virt_alloc_internal - allocate boot memory block
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region to allocate (phys address)
- * @max_addr: the upper bound of the memory region to allocate (phys address)
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * The @min_addr limit is dropped if it can not be satisfied and the allocation
- * will fall back to memory below @min_addr. Also, allocation may fall back
- * to any node in the system if the specified node can not
- * hold the requested memory.
- *
- * The allocation is performed from memory region limited by
- * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
- *
- * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
- *
- * The phys address of allocated boot memory block is converted to virtual and
- * allocated memory is reset to 0.
- *
- * In addition, function sets the min_count to 0 using kmemleak_alloc for
- * allocated boot memory block, so that it is never reported as leaks.
- *
- * RETURNS:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
-static void * __init memblock_virt_alloc_internal(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
-{
- phys_addr_t alloc;
- void *ptr;
-
- if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
- nid = NUMA_NO_NODE;
-
- /*
- * Detect any accidental use of these APIs after slab is ready, as at
- * this moment memblock may be deinitialized already and its
- * internal data may be destroyed (after execution of free_all_bootmem)
- */
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, nid);
-
- if (!align)
- align = SMP_CACHE_BYTES;
-
- if (max_addr > memblock.current_limit)
- max_addr = memblock.current_limit;
-
-again:
- alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid);
- if (alloc)
- goto done;
-
- if (nid != NUMA_NO_NODE) {
- alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE);
- if (alloc)
- goto done;
- }
-
- if (min_addr) {
- min_addr = 0;
- goto again;
- } else {
- goto error;
- }
-
-done:
- memblock_reserve(alloc, size);
- ptr = phys_to_virt(alloc);
- memset(ptr, 0, size);
-
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks. This is because many of these blocks
- * are only referred via the physical address which is not
- * looked up by kmemleak.
- */
- kmemleak_alloc(ptr, size, 0, 0);
-
- return ptr;
-
-error:
- return NULL;
-}
-
-/**
- * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region from where the allocation
- * is preferred (phys address)
- * @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
- * allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
- * additional debug information (including caller info), if enabled.
- *
- * RETURNS:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
-void * __init memblock_virt_alloc_try_nid_nopanic(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
-{
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
- return memblock_virt_alloc_internal(size, align, min_addr,
- max_addr, nid);
-}
-
-/**
- * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region from where the allocation
- * is preferred (phys address)
- * @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
- * allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
- * which provides debug information (including caller info), if enabled,
- * and panics if the request can not be satisfied.
- *
- * RETURNS:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
-void * __init memblock_virt_alloc_try_nid(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
-{
- void *ptr;
-
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr, (void *)_RET_IP_);
- ptr = memblock_virt_alloc_internal(size, align,
- min_addr, max_addr, nid);
- if (ptr)
- return ptr;
-
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
- __func__, (u64)size, (u64)align, nid, (u64)min_addr,
- (u64)max_addr);
- return NULL;
-}
-
-/**
- * __memblock_free_early - free boot memory block
- * @base: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
- */
-void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
-{
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
- kmemleak_free_part(__va(base), size);
- memblock_remove_range(&memblock.reserved, base, size);
-}
-
-/*
- * __memblock_free_late - free bootmem block pages directly to buddy allocator
- * @addr: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * This is only useful when the bootmem allocator has already been torn
- * down, but we are still initializing the system. Pages are released directly
- * to the buddy allocator, no bootmem metadata is updated because it is gone.
- */
-void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
-{
- u64 cursor, end;
-
- memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
- __func__, (u64)base, (u64)base + size - 1,
- (void *)_RET_IP_);
- kmemleak_free_part(__va(base), size);
- cursor = PFN_UP(base);
- end = PFN_DOWN(base + size);
-
- for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
- totalram_pages++;
- }
-}
/*
* Remaining API functions
@@ -1316,23 +828,6 @@ phys_addr_t __init memblock_phys_mem_size(void)
return memblock.memory.total_size;
}
-phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
-{
- unsigned long pages = 0;
- struct memblock_region *r;
- unsigned long start_pfn, end_pfn;
-
- for_each_memblock(memory, r) {
- start_pfn = memblock_region_memory_base_pfn(r);
- end_pfn = memblock_region_memory_end_pfn(r);
- start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
- end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
- pages += end_pfn - start_pfn;
- }
-
- return PFN_PHYS(pages);
-}
-
/* lowest address */
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
@@ -1348,14 +843,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
+ unsigned long i;
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
- struct memblock_region *r;
if (!limit)
return;
/* find out max address */
- for_each_memblock(memory, r) {
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ struct memblock_region *r = &memblock.memory.regions[i];
+
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1364,10 +861,8 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
}
/* truncate both memory and reserved regions */
- memblock_remove_range(&memblock.memory, max_addr,
- (phys_addr_t)ULLONG_MAX);
- memblock_remove_range(&memblock.reserved, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
+ __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
}
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -1398,23 +893,6 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
return memblock_search(&memblock.memory, addr) != -1;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- struct memblock_type *type = &memblock.memory;
- int mid = memblock_search(type, PFN_PHYS(pfn));
-
- if (mid == -1)
- return -1;
-
- *start_pfn = PFN_DOWN(type->regions[mid].base);
- *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
-
- return type->regions[mid].nid;
-}
-#endif
-
/**
* memblock_is_region_memory - check if a region is a subset of memory
* @base: base of region to check
@@ -1455,12 +933,13 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
+ int i;
phys_addr_t start, end, orig_start, orig_end;
- struct memblock_region *r;
+ struct memblock_type *mem = &memblock.memory;
- for_each_memblock(memory, r) {
- orig_start = r->base;
- orig_end = r->base + r->size;
+ for (i = 0; i < mem->cnt; i++) {
+ orig_start = mem->regions[i].base;
+ orig_end = mem->regions[i].base + mem->regions[i].size;
start = round_up(orig_start, align);
end = round_down(orig_end, align);
@@ -1468,12 +947,11 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
continue;
if (start < end) {
- r->base = start;
- r->size = end - start;
+ mem->regions[i].base = start;
+ mem->regions[i].size = end - start;
} else {
- memblock_remove_region(&memblock.memory,
- r - memblock.memory.regions);
- r--;
+ memblock_remove_region(mem, i);
+ i--;
}
}
}
@@ -1483,15 +961,9 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
memblock.current_limit = limit;
}
-phys_addr_t __init_memblock memblock_get_current_limit(void)
-{
- return memblock.current_limit;
-}
-
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
unsigned long long base, size;
- unsigned long flags;
int i;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1502,14 +974,13 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
- flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
- name, i, base, base + size - 1, size, nid_buf, flags);
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
+ name, i, base, base + size - 1, size, nid_buf);
}
}
@@ -1580,9 +1051,6 @@ static int __init memblock_init_debugfs(void)
return -ENXIO;
debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops);
-#endif
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a2c7bcb0e6e..09255ec8159 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,29 +45,25 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
-#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmpressure.h>
+#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
-#include <linux/lockdep.h>
-#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
-#include "slab.h"
#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
-struct cgroup_subsys memory_cgrp_subsys __read_mostly;
-EXPORT_SYMBOL(memory_cgrp_subsys);
+struct cgroup_subsys mem_cgroup_subsys __read_mostly;
+EXPORT_SYMBOL(mem_cgroup_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -80,7 +76,7 @@ int do_swap_account __read_mostly;
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
-static int really_do_swap_account __initdata;
+static int really_do_swap_account __initdata = 0;
#endif
#else
@@ -88,12 +84,24 @@ static int really_do_swap_account __initdata;
#endif
+/*
+ * Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_NSTATS,
+};
+
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
- "rss_huge",
"mapped_file",
- "writeback",
"swap",
};
@@ -112,14 +120,6 @@ static const char * const mem_cgroup_events_names[] = {
"pgmajfault",
};
-static const char * const mem_cgroup_lru_names[] = {
- "inactive_anon",
- "active_anon",
- "inactive_file",
- "active_file",
- "unevictable",
-};
-
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
@@ -144,13 +144,8 @@ struct mem_cgroup_stat_cpu {
};
struct mem_cgroup_reclaim_iter {
- /*
- * last scanned hierarchy member. Valid only if last_dead_count
- * matches memcg->dead_count of the hierarchy root group.
- */
- struct mem_cgroup *last_visited;
- int last_dead_count;
-
+ /* css_id of the last scanned hierarchy member */
+ int position;
/* scan generation, increased every round-trip */
unsigned int generation;
};
@@ -176,6 +171,10 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
+struct mem_cgroup_lru_info {
+ struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+};
+
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -228,46 +227,6 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct mem_cgroup_event {
- /*
- * memcg which the event belongs to.
- */
- struct mem_cgroup *memcg;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * register_event() callback will be used to add new userspace
- * waiter for changes related to this event. Use eventfd_signal()
- * on eventfd to send notification to userspace.
- */
- int (*register_event)(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args);
- /*
- * unregister_event() callback will be called when userspace closes
- * the eventfd or on cgroup removing. This callback must be set,
- * if you want provide notification functionality.
- */
- void (*unregister_event)(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd);
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_t wait;
- struct work_struct remove;
-};
-
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
@@ -289,19 +248,45 @@ struct mem_cgroup {
*/
struct res_counter res;
- /* vmpressure notifications */
- struct vmpressure vmpressure;
+ union {
+ /*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
- /*
- * the counter to account for mem+swap usage.
- */
- struct res_counter memsw;
+ /*
+ * rcu_freeing is used only when freeing struct mem_cgroup,
+ * so put it into a union to avoid wasting more memory.
+ * It must be disjoint from the css field. It could be
+ * in a union with the res field, but res plays a much
+ * larger part in mem_cgroup life than memsw, and might
+ * be of interest, even at time of free, when debugging.
+ * So share rcu_head with the less interesting memsw.
+ */
+ struct rcu_head rcu_freeing;
+ /*
+ * We also need some space for a worker in deferred freeing.
+ * By the time we call it, rcu_freeing is no longer in use.
+ */
+ struct work_struct work_freeing;
+ };
/*
* the counter to account for kernel memory usage.
*/
struct res_counter kmem;
/*
+ * Per cgroup active and inactive list, similar to the
+ * per zone LRU lists.
+ */
+ struct mem_cgroup_lru_info info;
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+ /*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
@@ -309,7 +294,8 @@ struct mem_cgroup {
bool oom_lock;
atomic_t under_oom;
- atomic_t oom_wakeups;
+
+ atomic_t refcnt;
int swappiness;
/* OOM-Killer disable */
@@ -334,7 +320,7 @@ struct mem_cgroup {
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
*/
- unsigned long move_charge_at_immigrate;
+ unsigned long move_charge_at_immigrate;
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
@@ -352,39 +338,30 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
- atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
+ struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list, but per-memcg;
- * protected by memcg_slab_mutex */
+ /* analogous to slab_common's slab_caches list. per-memcg */
struct list_head memcg_slab_caches;
+ /* Not a spinlock, we can take a lot of time walking the list */
+ struct mutex slab_caches_mutex;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
};
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
+/* We account when limit is on, but only after call sites are patched */
+#define KMEM_ACCOUNTED_MASK \
+ ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
+
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -396,13 +373,18 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
+static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+ clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
- /*
- * Our caller must use css_get() first, because memcg_uncharge_kmem()
- * will call css_put() if it sees the memcg is dead.
- */
- smp_wmb();
if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}
@@ -416,8 +398,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
/* Stuffs for move charges at task migration. */
/*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
+ * left-shifted bitmap of these types.
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -430,7 +412,6 @@ static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
- unsigned long immigrate_flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
@@ -443,12 +424,14 @@ static struct move_charge_struct {
static bool move_anon(void)
{
- return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
+ return test_bit(MOVE_CHARGE_TYPE_ANON,
+ &mc.to->move_charge_at_immigrate);
}
static bool move_file(void)
{
- return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
+ return test_bit(MOVE_CHARGE_TYPE_FILE,
+ &mc.to->move_charge_at_immigrate);
}
/*
@@ -488,29 +471,13 @@ enum res_type {
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
+static void mem_cgroup_get(struct mem_cgroup *memcg);
+static void mem_cgroup_put(struct mem_cgroup *memcg);
+static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
-/* Some nice accessors for the vmpressure. */
-struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
-{
- if (!memcg)
- memcg = root_mem_cgroup;
- return &memcg->vmpressure;
-}
-
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
-{
- return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+ return container_of(s, struct mem_cgroup, css);
}
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
@@ -518,25 +485,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
@@ -558,16 +506,15 @@ void sock_update_memcg(struct sock *sk)
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
- css_get(&sk->sk_cgrp->memcg->css);
+ mem_cgroup_get(sk->sk_cgrp->memcg);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
- css_tryget_online(&memcg->css)) {
+ if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
+ mem_cgroup_get(memcg);
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
@@ -581,7 +528,7 @@ void sock_release_memcg(struct sock *sk)
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
- css_put(&sk->sk_cgrp->memcg->css);
+ mem_cgroup_put(memcg);
}
}
@@ -590,13 +537,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem;
+ return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem))
+ if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -609,11 +556,16 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * The main reason for not using cgroup id for this:
- * this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
+ * There are two main reasons for not using the css_id for this:
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ * allocation in ->create(). At that point, we haven't yet allocated the
+ * css_id. Having a separate index prevents us from messing with the cgroup
+ * core for this
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -628,14 +580,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * cgrp_id space is not getting any smaller, and we don't have to necessarily
+ * css_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
+#define MEMCG_CACHES_MAX_SIZE 65535
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -673,12 +625,9 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
-mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
+mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
-
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
+ return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
}
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -687,12 +636,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
}
static struct mem_cgroup_per_zone *
-mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
+ return mem_cgroup_zoneinfo(memcg, nid, zid);
}
static struct mem_cgroup_tree_per_zone *
@@ -710,9 +659,11 @@ soft_limit_tree_from_page(struct page *page)
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
-static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -742,8 +693,10 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = true;
}
-static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
if (!mz->on_tree)
return;
@@ -751,11 +704,13 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = false;
}
-static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+ struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
spin_unlock(&mctz->lock);
}
@@ -765,14 +720,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
-
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
mctz = soft_limit_tree_from_page(page);
+
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_page_zoneinfo(memcg, page);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
excess = res_counter_soft_limit_excess(&memcg->res);
/*
* We have to update the tree if mz is on RB-tree or
@@ -782,12 +739,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(memcg, mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
@@ -795,15 +752,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
- struct mem_cgroup_tree_per_zone *mctz;
+ int node, zone;
struct mem_cgroup_per_zone *mz;
- int nid, zid;
+ struct mem_cgroup_tree_per_zone *mctz;
- for_each_node(nid) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- mctz = soft_limit_tree_node_zone(nid, zid);
- mem_cgroup_remove_exceeded(mz, mctz);
+ for_each_node(node) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ mz = mem_cgroup_zoneinfo(memcg, node, zone);
+ mctz = soft_limit_tree_node_zone(node, zone);
+ mem_cgroup_remove_exceeded(memcg, mz, mctz);
}
}
}
@@ -826,9 +783,9 @@ retry:
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
- !css_tryget_online(&mz->memcg->css))
+ !css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -895,7 +852,6 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
- get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
@@ -903,14 +859,14 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
- put_online_cpus();
return val;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- struct page *page,
bool anon, int nr_pages)
{
+ preempt_disable();
+
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
@@ -922,10 +878,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
- nr_pages);
-
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
@@ -935,9 +887,12 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+
+ preempt_enable();
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long
+mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
@@ -945,38 +900,46 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return mz->lru_size[lru];
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+static unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
+ unsigned int lru_mask)
{
- unsigned long nr = 0;
- int zid;
+ struct mem_cgroup_per_zone *mz;
+ enum lru_list lru;
+ unsigned long ret = 0;
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- nr += mz->lru_size[lru];
- }
+ for_each_lru(lru) {
+ if (BIT(lru) & lru_mask)
+ ret += mz->lru_size[lru];
}
- return nr;
+ return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ u64 total = 0;
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ total += mem_cgroup_zone_nr_lru_pages(memcg,
+ nid, zid, lru_mask);
+
+ return total;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
- unsigned long nr = 0;
int nid;
+ u64 total = 0;
for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
+ total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+ return total;
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -1039,6 +1002,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
preempt_enable();
}
+struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+{
+ return mem_cgroup_from_css(
+ cgroup_subsys_state(cont, mem_cgroup_subsys_id));
+}
+
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
@@ -1049,137 +1018,30 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
if (unlikely(!p))
return NULL;
- return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
+ return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}
-static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
+ if (!mm)
+ return NULL;
+ /*
+ * Because we have no locks, mm->owner's may be being moved to other
+ * cgroup. We use css_tryget() here even if this looks
+ * pessimistic (rather than adding locks here).
+ */
rcu_read_lock();
do {
- /*
- * Page cache insertions can happen withou an
- * actual mm context, e.g. during disk probing
- * on boot, loopback IO, acct() writes etc.
- */
- if (unlikely(!mm))
- memcg = root_mem_cgroup;
- else {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- memcg = root_mem_cgroup;
- }
- } while (!css_tryget_online(&memcg->css));
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ break;
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
-/*
- * Returns a next (in a pre-order walk) alive memcg (with elevated css
- * ref. count) or NULL if the whole root's subtree has been visited.
- *
- * helper function to be used by mem_cgroup_iter
- */
-static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
- struct mem_cgroup *last_visited)
-{
- struct cgroup_subsys_state *prev_css, *next_css;
-
- prev_css = last_visited ? &last_visited->css : NULL;
-skip_node:
- next_css = css_next_descendant_pre(prev_css, &root->css);
-
- /*
- * Even if we found a group we have to make sure it is
- * alive. css && !memcg means that the groups should be
- * skipped and we should continue the tree walk.
- * last_visited css is safe to use because it is
- * protected by css_get and the tree walk is rcu safe.
- *
- * We do not take a reference on the root of the tree walk
- * because we might race with the root removal when it would
- * be the only node in the iterated hierarchy and mem_cgroup_iter
- * would end up in an endless loop because it expects that at
- * least one valid node will be returned. Root cannot disappear
- * because caller of the iterator should hold it already so
- * skipping css reference should be safe.
- */
- if (next_css) {
- if ((next_css == &root->css) ||
- ((next_css->flags & CSS_ONLINE) &&
- css_tryget_online(next_css)))
- return mem_cgroup_from_css(next_css);
-
- prev_css = next_css;
- goto skip_node;
- }
-
- return NULL;
-}
-
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
-{
- /*
- * When a group in the hierarchy below root is destroyed, the
- * hierarchy iterator can no longer be trusted since it might
- * have pointed to the destroyed group. Invalidate it.
- */
- atomic_inc(&root->dead_count);
-}
-
-static struct mem_cgroup *
-mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *root,
- int *sequence)
-{
- struct mem_cgroup *position = NULL;
- /*
- * A cgroup destruction happens in two stages: offlining and
- * release. They are separated by a RCU grace period.
- *
- * If the iterator is valid, we may still race with an
- * offlining. The RCU lock ensures the object won't be
- * released, tryget will fail if we lost the race.
- */
- *sequence = atomic_read(&root->dead_count);
- if (iter->last_dead_count == *sequence) {
- smp_rmb();
- position = iter->last_visited;
-
- /*
- * We cannot take a reference to root because we might race
- * with root removal and returning NULL would end up in
- * an endless loop on the iterator user level when root
- * would be returned all the time.
- */
- if (position && position != root &&
- !css_tryget_online(&position->css))
- position = NULL;
- }
- return position;
-}
-
-static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
- struct mem_cgroup *last_visited,
- struct mem_cgroup *new_position,
- struct mem_cgroup *root,
- int sequence)
-{
- /* root reference counting symmetric to mem_cgroup_iter_load */
- if (last_visited && last_visited != root)
- css_put(&last_visited->css);
- /*
- * We store the sequence count from the time @last_visited was
- * loaded successfully instead of rereading it here so that we
- * don't lose destruction events in between. We could have
- * raced with the destruction of @new_position after all.
- */
- iter->last_visited = new_position;
- smp_wmb();
- iter->last_dead_count = sequence;
-}
-
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1202,7 +1064,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *last_visited = NULL;
+ int id = 0;
if (mem_cgroup_disabled())
return NULL;
@@ -1211,53 +1073,53 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
if (prev && !reclaim)
- last_visited = prev;
+ id = css_id(&prev->css);
+
+ if (prev && prev != root)
+ css_put(&prev->css);
if (!root->use_hierarchy && root != root_mem_cgroup) {
if (prev)
- goto out_css_put;
+ return NULL;
return root;
}
- rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
- int uninitialized_var(seq);
+ struct cgroup_subsys_state *css;
if (reclaim) {
+ int nid = zone_to_nid(reclaim->zone);
+ int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
- mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
- if (prev && reclaim->generation != iter->generation) {
- iter->last_visited = NULL;
- goto out_unlock;
- }
-
- last_visited = mem_cgroup_iter_load(iter, root, &seq);
+ if (prev && reclaim->generation != iter->generation)
+ return NULL;
+ id = iter->position;
}
- memcg = __mem_cgroup_iter_next(root, last_visited);
+ rcu_read_lock();
+ css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
+ if (css) {
+ if (css == &root->css || css_tryget(css))
+ memcg = mem_cgroup_from_css(css);
+ } else
+ id = 0;
+ rcu_read_unlock();
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, root,
- seq);
-
- if (!memcg)
+ iter->position = id;
+ if (!css)
iter->generation++;
else if (!prev && memcg)
reclaim->generation = iter->generation;
}
- if (prev && !memcg)
- goto out_unlock;
+ if (prev && !css)
+ return NULL;
}
-out_unlock:
- rcu_read_unlock();
-out_css_put:
- if (prev && prev != root)
- css_put(&prev->css);
-
return memcg;
}
@@ -1334,7 +1196,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
goto out;
}
- mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+ mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
lruvec = &mz->lruvec;
out:
/*
@@ -1393,7 +1255,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
- mz = mem_cgroup_page_zoneinfo(memcg, page);
+ mz = page_cgroup_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
@@ -1441,7 +1303,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
+ return css_is_ancestor(&memcg->css, &root_memcg->css);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1455,16 +1317,15 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return ret;
}
-bool task_in_mem_cgroup(struct task_struct *task,
- const struct mem_cgroup *memcg)
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
{
+ int ret;
struct mem_cgroup *curr = NULL;
struct task_struct *p;
- bool ret;
p = find_lock_task_mm(task);
if (p) {
- curr = get_mem_cgroup_from_mm(p->mm);
+ curr = try_get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1472,12 +1333,14 @@ bool task_in_mem_cgroup(struct task_struct *task,
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
- rcu_read_lock();
+ task_lock(task);
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
- rcu_read_unlock();
+ task_unlock(task);
}
+ if (!curr)
+ return 0;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1508,6 +1371,17 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
+{
+ unsigned long active;
+ unsigned long inactive;
+
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
+
+ return (active > inactive);
+}
+
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1530,8 +1404,10 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
+ struct cgroup *cgrp = memcg->css.cgroup;
+
/* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
+ if (cgrp->parent == NULL)
return vm_swappiness;
return memcg->swappiness;
@@ -1575,12 +1451,23 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
}
/*
- * A routine for checking "mem" is under move_account() or not.
+ * 2 routines for checking "mem" is under move_account() or not.
*
- * Checking a cgroup is mc.from or mc.to or under hierarchy of
- * moving cgroups. This is for waiting at high-memory pressure
- * caused by "move".
+ * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
+ * is used for avoiding races in accounting. If true,
+ * pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
+ * under hierarchy of moving cgroups. This is for
+ * waiting at hith-memory prressure caused by "move".
*/
+
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+ return atomic_read(&memcg->moving_account) > 0;
+}
+
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
@@ -1623,6 +1510,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
+ * see mem_cgroup_stolen(), too.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
@@ -1636,9 +1524,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
spin_unlock_irqrestore(&memcg->move_lock, *flags);
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
@@ -1647,57 +1534,64 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /* oom_info_lock ensures that parallel ooms do not interleave */
- static DEFINE_MUTEX(oom_info_lock);
- struct mem_cgroup *iter;
- unsigned int i;
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
+ /*
+ * Need a buffer in BSS, can't rely on allocations. The code relies
+ * on the assumption that OOM is serialized for memory controller.
+ * If this assumption is broken, revisit this code.
+ */
+ static char memcg_name[PATH_MAX];
+ int ret;
- if (!p)
+ if (!memcg || !p)
return;
- mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_info(" killed as a result of limit of ");
- pr_cont_cgroup_path(memcg->css.cgroup);
- pr_info("\n");
+ mem_cgrp = memcg->css.cgroup;
+ task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
+
+ ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
+ if (ret < 0) {
+ /*
+ * Unfortunately, we are unable to convert to a useful name
+ * But we'll still print out the usage information
+ */
+ rcu_read_unlock();
+ goto done;
+ }
+ rcu_read_unlock();
+
+ printk(KERN_INFO "Task in %s killed", memcg_name);
+ rcu_read_lock();
+ ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
+ if (ret < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
rcu_read_unlock();
- pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
+ /*
+ * Continues from above, so we don't need an KERN_ level
+ */
+ printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
+done:
+
+ printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
+ printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
+ "failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+ printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
- K(mem_cgroup_read_stat(iter, i)));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
-
- pr_cont("\n");
- }
- mutex_unlock(&oom_info_lock);
}
/*
@@ -1752,11 +1646,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct task_struct *chosen = NULL;
/*
- * If current has a pending SIGKILL or is exiting, then automatically
- * select it. The goal is to allow it to allocate so that it may
- * quickly exit and free its memory.
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
*/
- if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -1764,11 +1658,12 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
- struct css_task_iter it;
+ struct cgroup *cgroup = iter->css.cgroup;
+ struct cgroup_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, &it);
- while ((task = css_task_iter_next(&it))) {
+ cgroup_iter_start(cgroup, &it);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
switch (oom_scan_process_thread(task, totalpages, NULL,
false)) {
case OOM_SCAN_SELECT:
@@ -1781,7 +1676,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
case OOM_SCAN_CONTINUE:
continue;
case OOM_SCAN_ABORT:
- css_task_iter_end(&it);
+ cgroup_iter_end(cgroup, &it);
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
@@ -1790,20 +1685,15 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (!points || points < chosen_points)
- continue;
- /* Prefer thread group leaders for display purposes */
- if (points == chosen_points &&
- thread_group_leader(chosen))
- continue;
-
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
+ if (points > chosen_points) {
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
+ }
}
- css_task_iter_end(&it);
+ cgroup_iter_end(cgroup, &it);
}
if (!chosen)
@@ -2039,24 +1929,15 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
return total;
}
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map memcg_oom_lock_dep_map = {
- .name = "memcg_oom_lock",
-};
-#endif
-
-static DEFINE_SPINLOCK(memcg_oom_lock);
-
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
+ * Has to be called with memcg_oom_lock
*/
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter, *failed = NULL;
- spin_lock(&memcg_oom_lock);
-
for_each_mem_cgroup_tree(iter, memcg) {
if (iter->oom_lock) {
/*
@@ -2070,35 +1951,33 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
iter->oom_lock = true;
}
- if (failed) {
- /*
- * OK, we failed to lock the whole subtree so we have
- * to clean up what we set up to the failing subtree
- */
- for_each_mem_cgroup_tree(iter, memcg) {
- if (iter == failed) {
- mem_cgroup_iter_break(memcg, iter);
- break;
- }
- iter->oom_lock = false;
- }
- } else
- mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
-
- spin_unlock(&memcg_oom_lock);
+ if (!failed)
+ return true;
- return !failed;
+ /*
+ * OK, we failed to lock the whole subtree so we have to clean up
+ * what we set up to the failing subtree
+ */
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == failed) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ iter->oom_lock = false;
+ }
+ return false;
}
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+/*
+ * Has to be called with memcg_oom_lock
+ */
+static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
- spin_lock(&memcg_oom_lock);
- mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
- spin_unlock(&memcg_oom_lock);
+ return 0;
}
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2122,6 +2001,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
atomic_add_unless(&iter->under_oom, -1, 0);
}
+static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
@@ -2151,7 +2031,6 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
- atomic_inc(&memcg->oom_wakeups);
/* for filtering, pass "memcg" as argument. */
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
@@ -2162,106 +2041,67 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
memcg_wakeup_oom(memcg);
}
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
-{
- if (!current->memcg_oom.may_oom)
- return;
- /*
- * We are in the middle of the charge context here, so we
- * don't want to block when potentially sitting on a callstack
- * that holds all kinds of filesystem and mm locks.
- *
- * Also, the caller may handle a failed allocation gracefully
- * (like optional page cache readahead) and so an OOM killer
- * invocation might not even be necessary.
- *
- * That's why we don't do anything here except remember the
- * OOM context and then deal with it at the end of the page
- * fault when the stack is unwound, the locks are released,
- * and when we know whether the fault was overall successful.
- */
- css_get(&memcg->css);
- current->memcg_oom.memcg = memcg;
- current->memcg_oom.gfp_mask = mask;
- current->memcg_oom.order = order;
-}
-
-/**
- * mem_cgroup_oom_synchronize - complete memcg OOM handling
- * @handle: actually kill/wait or just clean up the OOM state
- *
- * This has to be called at the end of a page fault if the memcg OOM
- * handler was enabled.
- *
- * Memcg supports userspace OOM handling where failed allocations must
- * sleep on a waitqueue until the userspace task resolves the
- * situation. Sleeping directly in the charge context with all kinds
- * of locks held is not a good idea, instead we remember an OOM state
- * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to complete the OOM handling.
- *
- * Returns %true if an ongoing memcg OOM situation was detected and
- * completed, %false otherwise.
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_oom_synchronize(bool handle)
+static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
+ int order)
{
- struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
- bool locked;
-
- /* OOM is global, do not handle */
- if (!memcg)
- return false;
-
- if (!handle)
- goto cleanup;
+ bool locked, need_to_kill;
owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
-
- prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ need_to_kill = true;
mem_cgroup_mark_under_oom(memcg);
- locked = mem_cgroup_oom_trylock(memcg);
-
+ /* At first, try to OOM lock hierarchy under memcg.*/
+ spin_lock(&memcg_oom_lock);
+ locked = mem_cgroup_oom_lock(memcg);
+ /*
+ * Even if signal_pending(), we can't quit charge() loop without
+ * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+ * under OOM is always welcomed, use TASK_KILLABLE here.
+ */
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ if (!locked || memcg->oom_kill_disable)
+ need_to_kill = false;
if (locked)
mem_cgroup_oom_notify(memcg);
+ spin_unlock(&memcg_oom_lock);
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
+ if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
- current->memcg_oom.order);
+ mem_cgroup_out_of_memory(memcg, mask, order);
} else {
schedule();
- mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
}
-
- if (locked) {
+ spin_lock(&memcg_oom_lock);
+ if (locked)
mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- }
-cleanup:
- current->memcg_oom.memcg = NULL;
- css_put(&memcg->css);
+ memcg_wakeup_oom(memcg);
+ spin_unlock(&memcg_oom_lock);
+
+ mem_cgroup_unmark_under_oom(memcg);
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;
+ /* Give chance to dying process */
+ schedule_timeout_uninterruptible(1);
return true;
}
/*
- * Used to update mapped file or writeback or other statistics.
+ * Currently used to update mapped file statistics, but the routine can be
+ * generalized to update other statistics as well.
*
* Notes: Race condition
*
- * We usually use lock_page_cgroup() for accessing page_cgroup member but
+ * We usually use page_cgroup_lock() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
@@ -2275,8 +2115,8 @@ cleanup:
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
- * small, we check memcg->moving_account and detect there are possibility
- * of race or not. If there is, we take a lock.
+ * small, we check mm->moving_account and detect there are possibility of race
+ * If there is, we take a lock.
*/
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2294,10 +2134,9 @@ again:
* If this memory cgroup is not under account moving, we don't
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock().
+ * rcu_read_unlock() if mem_cgroup_stolen() == true.
*/
- VM_BUG_ON(!rcu_read_lock_held());
- if (atomic_read(&memcg->moving_account) <= 0)
+ if (!mem_cgroup_stolen(memcg))
return;
move_lock_mem_cgroup(memcg, flags);
@@ -2321,7 +2160,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
}
void mem_cgroup_update_page_stat(struct page *page,
- enum mem_cgroup_stat_index idx, int val)
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2330,11 +2169,18 @@ void mem_cgroup_update_page_stat(struct page *page,
if (mem_cgroup_disabled())
return;
- VM_BUG_ON(!rcu_read_lock_held());
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
return;
+ switch (idx) {
+ case MEMCG_NR_FILE_MAPPED:
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
+ break;
+ default:
+ BUG();
+ }
+
this_cpu_add(memcg->stat->count[idx], val);
}
@@ -2405,22 +2251,11 @@ static void drain_stock(struct memcg_stock_pcp *stock)
*/
static void drain_local_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
+ struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
-static void __init memcg_stock_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct memcg_stock_pcp *stock =
- &per_cpu(memcg_stock, cpu);
- INIT_WORK(&stock->work, drain_local_stock);
- }
-}
-
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
@@ -2476,7 +2311,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
flush_work(&stock->work);
}
out:
- put_online_cpus();
+ put_online_cpus();
}
/*
@@ -2529,7 +2364,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
spin_unlock(&memcg->pcp_counter_lock);
}
-static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
@@ -2552,17 +2387,18 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
}
-/* See mem_cgroup_try_charge() for details */
+/* See __mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages, unsigned int min_pages,
- bool invoke_oom)
+ bool oom_check)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
@@ -2619,117 +2455,171 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_wait_acct_move(mem_over_limit))
return CHARGE_RETRY;
- if (invoke_oom)
- mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
+ return CHARGE_OOM_DIE;
- return CHARGE_NOMEM;
+ return CHARGE_RETRY;
}
-/**
- * mem_cgroup_try_charge - try charging a memcg
- * @memcg: memcg to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
+/*
+ * __mem_cgroup_try_charge() does
+ * 1. detect memcg to be charged against from passed *mm and *ptr,
+ * 2. update res_counter
+ * 3. call memory reclaim if necessary.
+ *
+ * In some special case, if the task is fatal, fatal_signal_pending() or
+ * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
+ * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
+ * as possible without any hazards. 2: all pages should have a valid
+ * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
+ * pointer, that is treated as a charge to root_mem_cgroup.
*
- * Returns 0 if @memcg was charged successfully, -EINTR if the charge
- * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
+ * So __mem_cgroup_try_charge() will return
+ * 0 ... on success, filling *ptr with a valid memcg pointer.
+ * -ENOMEM ... charge failure because of resource limits.
+ * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
+ *
+ * Unlike the exported interface, an "oom" parameter is added. if oom==true,
+ * the oom-killer can be invoked.
*/
-static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- bool oom)
+static int __mem_cgroup_try_charge(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ struct mem_cgroup **ptr,
+ bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *memcg = NULL;
int ret;
- if (mem_cgroup_is_root(memcg))
- goto done;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
+ * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+ * in system level. So, allow to go ahead dying process in addition to
+ * MEMDIE process.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(test_thread_flag(TIF_MEMDIE)
+ || fatal_signal_pending(current)))
goto bypass;
- if (unlikely(task_in_memcg_oom(current)))
- goto nomem;
-
- if (gfp_mask & __GFP_NOFAIL)
- oom = false;
+ /*
+ * We always charge the cgroup the mm_struct belongs to.
+ * The mm_struct's mem_cgroup changes on task migration if the
+ * thread group leader migrates. It's possible that mm is not
+ * set, if so charge the root memcg (happens for pagecache usage).
+ */
+ if (!*ptr && !mm)
+ *ptr = root_mem_cgroup;
again:
- if (consume_stock(memcg, nr_pages))
- goto done;
+ if (*ptr) { /* css should be a valid one */
+ memcg = *ptr;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
+ if (consume_stock(memcg, nr_pages))
+ goto done;
+ css_get(&memcg->css);
+ } else {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = rcu_dereference(mm->owner);
+ /*
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "memcg" can point to root or p can be NULL with
+ * race with swapoff. Then, we have small risk of mis-accouning.
+ * But such kind of mis-account by race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allo that
+ * small race, here.
+ * (*) swapoff at el will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
+ */
+ memcg = mem_cgroup_from_task(p);
+ if (!memcg)
+ memcg = root_mem_cgroup;
+ if (mem_cgroup_is_root(memcg)) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (consume_stock(memcg, nr_pages)) {
+ /*
+ * It seems dagerous to access memcg without css_get().
+ * But considering how consume_stok works, it's not
+ * necessary. If consume_stock success, some charges
+ * from this memcg are cached on this cpu. So, we
+ * don't need to call css_get()/css_tryget() before
+ * calling consume_stock().
+ */
+ rcu_read_unlock();
+ goto done;
+ }
+ /* after here, we may be blocked. we need to get refcnt */
+ if (!css_tryget(&memcg->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+ }
do {
- bool invoke_oom = oom && !nr_oom_retries;
+ bool oom_check;
/* If killed, bypass charge */
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current)) {
+ css_put(&memcg->css);
goto bypass;
+ }
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
- nr_pages, invoke_oom);
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ }
+
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+ oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
+ css_put(&memcg->css);
+ memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom)
+ if (!oom) {
+ css_put(&memcg->css);
goto nomem;
+ }
+ /* If oom, we never return -ENOMEM */
nr_oom_retries--;
break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+ css_put(&memcg->css);
+ goto bypass;
}
} while (ret != CHARGE_OK);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ css_put(&memcg->css);
done:
+ *ptr = memcg;
return 0;
nomem:
- if (!(gfp_mask & __GFP_NOFAIL))
- return -ENOMEM;
+ *ptr = NULL;
+ return -ENOMEM;
bypass:
+ *ptr = root_mem_cgroup;
return -EINTR;
}
-/**
- * mem_cgroup_try_charge_mm - try charging a mm
- * @mm: mm_struct to charge
- * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
- *
- * Returns the charged mem_cgroup associated with the given mm_struct or
- * NULL the charge failed.
- */
-static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- bool oom)
-
-{
- struct mem_cgroup *memcg;
- int ret;
-
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
- css_put(&memcg->css);
- if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- memcg = NULL;
-
- return memcg;
-}
-
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
@@ -2767,16 +2657,21 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
+ struct cgroup_subsys_state *css;
+
/* ID 0 is unused ID */
if (!id)
return NULL;
- return mem_cgroup_from_id(id);
+ css = css_lookup(&mem_cgroup_subsys, id);
+ if (!css)
+ return NULL;
+ return mem_cgroup_from_css(css);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2786,20 +2681,20 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
- if (memcg && !css_tryget_online(&memcg->css))
+ if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
- if (memcg && !css_tryget_online(&memcg->css))
+ if (memcg && !css_tryget(&memcg->css))
memcg = NULL;
rcu_read_unlock();
}
@@ -2820,7 +2715,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
+ VM_BUG_ON(PageCgroupUsed(pc));
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2848,14 +2743,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
- */
+ */
smp_wmb();
SetPageCgroupUsed(pc);
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2867,7 +2762,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
else
anon = false;
- mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
+ mem_cgroup_charge_statistics(memcg, anon, nr_pages);
unlock_page_cgroup(pc);
/*
@@ -2881,18 +2776,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-static DEFINE_MUTEX(activate_kmem_mutex);
-
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- memcg_kmem_is_active(memcg);
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
}
/*
@@ -2905,13 +2792,14 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
}
#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
+static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
@@ -2919,10 +2807,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
print_slabinfo_header(m);
- mutex_lock(&memcg_slab_mutex);
+ mutex_lock(&memcg->slab_caches_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list)
cache_show(memcg_params_to_cache(params), m);
- mutex_unlock(&memcg_slab_mutex);
+ mutex_unlock(&memcg->slab_caches_mutex);
return 0;
}
@@ -2931,17 +2819,27 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
+ struct mem_cgroup *_memcg;
int ret = 0;
+ bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
- oom_gfp_allowed(gfp));
+ /*
+ * Conditions under which we can wait for the oom_killer. Those are
+ * the same conditions tested by the core page allocator
+ */
+ may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
+
+ _memcg = memcg;
+ ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
+ &_memcg, may_oom);
+
if (ret == -EINTR) {
/*
- * mem_cgroup_try_charge() chosed to bypass to root due to
+ * __mem_cgroup_try_charge() chosed to bypass to root due to
* OOM kill or fatal signal. Since our only options are to
* either fail the allocation or charge it to this cgroup, do
* it as a temporary condition. But we can't fail. From a
@@ -2951,7 +2849,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
*
* This condition will only trigger if the task entered
* memcg_charge_kmem in a sane state, but was OOM-killed during
- * mem_cgroup_try_charge() above. Tasks that were already
+ * __mem_cgroup_try_charge() above. Tasks that were already
* dying when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
@@ -2976,16 +2874,18 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
if (res_counter_uncharge(&memcg->kmem, size))
return;
- /*
- * Releases a reference taken in kmem_cgroup_css_offline in case
- * this last uncharge is racing with the offlining code or it is
- * outliving the memcg existence.
- *
- * The memory barrier imposed by test&clear is paired with the
- * explicit one in memcg_kmem_mark_dead().
- */
if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
+ mem_cgroup_put(memcg);
+}
+
+void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
+{
+ if (!memcg)
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
}
/*
@@ -2998,6 +2898,43 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+ int num, ret;
+
+ num = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (num < 0)
+ return num;
+ /*
+ * After this point, kmem_accounted (that we test atomically in
+ * the beginning of this conditional), is no longer 0. This
+ * guarantees only one process will set the following boolean
+ * to true. We don't need test_and_set because we're protected
+ * by the set_limit_mutex anyway.
+ */
+ memcg_kmem_set_activated(memcg);
+
+ ret = memcg_update_all_caches(num+1);
+ if (ret) {
+ ida_simple_remove(&kmem_limited_groups, num);
+ memcg_kmem_clear_activated(memcg);
+ return ret;
+ }
+
+ memcg->kmemcg_id = num;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+ return 0;
+}
+
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3028,21 +2965,22 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(!is_root_cache(s));
+ VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
if (num_groups > memcg_limited_groups_array_size) {
int i;
- struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
- size += offsetof(struct memcg_cache_params, memcg_caches);
+ size += sizeof(struct memcg_cache_params);
- new_params = kzalloc(size, GFP_KERNEL);
- if (!new_params)
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params) {
+ s->memcg_params = cur_params;
return -ENOMEM;
+ }
- new_params->is_root_cache = true;
+ s->memcg_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3056,7 +2994,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- new_params->memcg_caches[i] =
+ s->memcg_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3069,26 +3007,21 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- rcu_assign_pointer(s->memcg_params, new_params);
- if (cur_params)
- kfree_rcu(cur_params, rcu_head);
+ kfree(cur_params);
}
return 0;
}
-int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
- size_t size;
+ size_t size = sizeof(struct memcg_cache_params);
if (!memcg_kmem_enabled())
return 0;
- if (!memcg) {
- size = offsetof(struct memcg_cache_params, memcg_caches);
+ if (!memcg)
size += memcg_limited_groups_array_size * sizeof(void *);
- } else
- size = sizeof(struct memcg_cache_params);
s->memcg_params = kzalloc(size, GFP_KERNEL);
if (!s->memcg_params)
@@ -3097,85 +3030,39 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
- css_get(&memcg->css);
- } else
- s->memcg_params->is_root_cache = true;
-
+ }
return 0;
}
-void memcg_free_cache_params(struct kmem_cache *s)
-{
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- css_put(&s->memcg_params->memcg->css);
- kfree(s->memcg_params);
-}
-
-static void memcg_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
+void memcg_release_cache(struct kmem_cache *s)
{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by
- memcg_slab_mutex */
- struct kmem_cache *cachep;
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
int id;
- lockdep_assert_held(&memcg_slab_mutex);
-
- id = memcg_cache_id(memcg);
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (cache_from_memcg_idx(root_cache, id))
- return;
-
- cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
/*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
+ * This happens, for instance, when a root cache goes away before we
+ * add any memcg.
*/
- if (!cachep)
+ if (!s->memcg_params)
return;
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
- /*
- * Since readers won't lock (see cache_from_memcg_idx()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id]);
- root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
- struct kmem_cache *root_cache;
- struct mem_cgroup *memcg;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- BUG_ON(is_root_cache(cachep));
+ if (s->memcg_params->is_root_cache)
+ goto out;
- root_cache = cachep->memcg_params->root_cache;
- memcg = cachep->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- root_cache->memcg_params->memcg_caches[id] = NULL;
+ root = s->memcg_params->root_cache;
+ root->memcg_params->memcg_caches[id] = NULL;
+ mem_cgroup_put(memcg);
- list_del(&cachep->memcg_params->list);
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_del(&s->memcg_params->list);
+ mutex_unlock(&memcg->slab_caches_mutex);
- kmem_cache_destroy(cachep);
+out:
+ kfree(s->memcg_params);
}
/*
@@ -3209,93 +3096,266 @@ static inline void memcg_resume_kmem_account(void)
current->memcg_kmem_skip_account--;
}
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
+static void kmem_cache_destroy_work_func(struct work_struct *w)
{
- struct kmem_cache *c;
- int i, failed = 0;
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *p;
- mutex_lock(&memcg_slab_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ p = container_of(w, struct memcg_cache_params, destroy);
+
+ cachep = memcg_params_to_cache(p);
+
+ /*
+ * If we get down to 0 after shrink, we could delete right away.
+ * However, memcg_release_pages() already puts us back in the workqueue
+ * in that case. If we proceed deleting, we'll get a dangling
+ * reference, and removing the object from the workqueue in that case
+ * is unnecessary complication. We are not a fast path.
+ *
+ * Note that this case is fundamentally different from racing with
+ * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
+ * kmem_cache_shrink, not only we would be reinserting a dead cache
+ * into the queue, but doing so from inside the worker racing to
+ * destroy it.
+ *
+ * So if we aren't down to zero, we'll just schedule a worker and try
+ * again
+ */
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ return;
+ } else
+ kmem_cache_destroy(cachep);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+ if (!cachep->memcg_params->dead)
+ return;
+
+ /*
+ * There are many ways in which we can get here.
+ *
+ * We can get to a memory-pressure situation while the delayed work is
+ * still pending to run. The vmscan shrinkers can then release all
+ * cache memory and get us to destruction. If this is the case, we'll
+ * be executed twice, which is a bug (the second time will execute over
+ * bogus data). In this case, cancelling the work should be fine.
+ *
+ * But we can also get here from the worker itself, if
+ * kmem_cache_shrink is enough to shake all the remaining objects and
+ * get the page count to 0. In this case, we'll deadlock if we try to
+ * cancel the work (the worker runs with an internal lock held, which
+ * is the same lock we would hold for cancel_work_sync().)
+ *
+ * Since we can't possibly know who got us here, just refrain from
+ * running if there is already work pending
+ */
+ if (work_pending(&cachep->memcg_params->destroy))
+ return;
+ /*
+ * We have to defer the actual destroying to a workqueue, because
+ * we might currently be in a context that cannot sleep.
+ */
+ schedule_work(&cachep->memcg_params->destroy);
+}
+
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+ char *name;
+ struct dentry *dentry;
+
+ rcu_read_lock();
+ dentry = rcu_dereference(memcg->css.cgroup->dentry);
+ rcu_read_unlock();
+
+ BUG_ON(dentry == NULL);
+
+ name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+ memcg_cache_id(memcg), dentry->d_name.name);
+
+ return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
+{
+ char *name;
+ struct kmem_cache *new;
+
+ name = memcg_cache_name(memcg, s);
+ if (!name)
+ return NULL;
+
+ new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+ (s->flags & ~SLAB_PANIC), s->ctor, s);
+
+ if (new)
+ new->allocflags |= __GFP_KMEMCG;
+
+ kfree(name);
+ return new;
+}
- memcg_unregister_cache(c);
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allow them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache from a non
+ * will span more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct kmem_cache *new_cachep;
+ int idx;
+
+ BUG_ON(!memcg_can_account_kmem(memcg));
- if (cache_from_memcg_idx(s, i))
- failed++;
+ idx = memcg_cache_id(memcg);
+
+ mutex_lock(&memcg_cache_mutex);
+ new_cachep = cachep->memcg_params->memcg_caches[idx];
+ if (new_cachep)
+ goto out;
+
+ new_cachep = kmem_cache_dup(memcg, cachep);
+ if (new_cachep == NULL) {
+ new_cachep = cachep;
+ goto out;
}
- mutex_unlock(&memcg_slab_mutex);
- return failed;
+
+ mem_cgroup_get(memcg);
+ atomic_set(&new_cachep->memcg_params->nr_pages , 0);
+
+ cachep->memcg_params->memcg_caches[idx] = new_cachep;
+ /*
+ * the readers won't lock, make sure everybody sees the updated value,
+ * so they won't put stuff in the queue again for no reason
+ */
+ wmb();
+out:
+ mutex_unlock(&memcg_cache_mutex);
+ return new_cachep;
}
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
- struct kmem_cache *cachep;
- struct memcg_cache_params *params, *tmp;
+ struct kmem_cache *c;
+ int i;
- if (!memcg_kmem_is_active(memcg))
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
return;
- mutex_lock(&memcg_slab_mutex);
- list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- cachep = memcg_params_to_cache(params);
- kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- memcg_unregister_cache(cachep);
+ /*
+ * If the cache is being destroyed, we trust that there is no one else
+ * requesting objects from it. Even if there are, the sanity checks in
+ * kmem_cache_destroy should caught this ill-case.
+ *
+ * Still, we don't want anyone else freeing memcg_caches under our
+ * noses, which can happen if a new memcg comes to life. As usual,
+ * we'll take the set_limit_mutex to protect ourselves against this.
+ */
+ mutex_lock(&set_limit_mutex);
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ c = s->memcg_params->memcg_caches[i];
+ if (!c)
+ continue;
+
+ /*
+ * We will now manually delete the caches, so to avoid races
+ * we need to cancel all pending destruction workers and
+ * proceed with destruction ourselves.
+ *
+ * kmem_cache_destroy() will call kmem_cache_shrink internally,
+ * and that could spawn the workers again: it is likely that
+ * the cache still have active pages until this very moment.
+ * This would lead us back to mem_cgroup_destroy_cache.
+ *
+ * But that will not execute at all if the "dead" flag is not
+ * set, so flip it down to guarantee we are in control.
+ */
+ c->memcg_params->dead = false;
+ cancel_work_sync(&c->memcg_params->destroy);
+ kmem_cache_destroy(c);
}
- mutex_unlock(&memcg_slab_mutex);
+ mutex_unlock(&set_limit_mutex);
}
-struct memcg_register_cache_work {
+struct create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
-static void memcg_register_cache_func(struct work_struct *w)
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
- struct memcg_register_cache_work *cw =
- container_of(w, struct memcg_register_cache_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
- mutex_lock(&memcg_slab_mutex);
- memcg_register_cache(memcg, cachep);
- mutex_unlock(&memcg_slab_mutex);
+ if (!memcg_kmem_is_active(memcg))
+ return;
- css_put(&memcg->css);
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ cachep = memcg_params_to_cache(params);
+ cachep->memcg_params->dead = true;
+ INIT_WORK(&cachep->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
+ schedule_work(&cachep->memcg_params->destroy);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+ struct create_work *cw;
+
+ cw = container_of(w, struct create_work, work);
+ memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ /* Drop the reference gotten when we enqueued. */
+ css_put(&cw->memcg->css);
kfree(cw);
}
/*
* Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock.
*/
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct memcg_register_cache_work *cw;
+ struct create_work *cw;
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
- if (cw == NULL) {
- css_put(&memcg->css);
+ cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+ if (cw == NULL)
+ return;
+
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget(&memcg->css)) {
+ kfree(cw);
return;
}
cw->memcg = memcg;
cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_register_cache_func);
+ INIT_WORK(&cw->work, memcg_create_cache_work_func);
schedule_work(&cw->work);
}
-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_register_cache will recurse.
+ * in __memcg_create_cache_enqueue will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -3304,27 +3364,9 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
memcg_stop_kmem_account();
- __memcg_schedule_register_cache(memcg, cachep);
+ __memcg_create_cache_enqueue(memcg, cachep);
memcg_resume_kmem_account();
}
-
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
- int res;
-
- res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
- PAGE_SIZE << order);
- if (!res)
- atomic_add(1 << order, &cachep->memcg_params->nr_pages);
- return res;
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
- memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
- atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
-}
-
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -3342,7 +3384,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
+ int idx;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3352,39 +3394,43 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+ rcu_read_unlock();
if (!memcg_can_account_kmem(memcg))
- goto out;
-
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
- if (likely(memcg_cachep)) {
- cachep = memcg_cachep;
- goto out;
- }
+ return cachep;
- /* The corresponding put will be done in the workqueue. */
- if (!css_tryget_online(&memcg->css))
- goto out;
- rcu_read_unlock();
+ idx = memcg_cache_id(memcg);
/*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
+ * barrier to mare sure we're always seeing the up to date value. The
+ * code updating memcg_caches will issue a write barrier to match this.
*/
- memcg_schedule_register_cache(memcg, cachep);
- return cachep;
-out:
- rcu_read_unlock();
- return cachep;
+ read_barrier_depends();
+ if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+ * context), we could be be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+ * However, there are some clashes that can arrive from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+ * Also, because cache creation issue get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * better to defer everything.
+ */
+ memcg_create_cache_enqueue(memcg, cachep);
+ return cachep;
+ }
+
+ return cachep->memcg_params->memcg_caches[idx];
}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
/*
* We need to verify if the allocation against current->mm->owner's memcg is
@@ -3407,37 +3453,16 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
int ret;
*_memcg = NULL;
+ memcg = try_get_mem_cgroup_from_mm(current->mm);
/*
- * Disabling accounting is only relevant for some specific memcg
- * internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are
- * accounted to kmemcg (alloc_kmem_pages and friends) only happen
- * outside memcg core. We are mostly concerned with cache allocations,
- * and by having this test at memcg_kmem_get_cache, we are already able
- * to relay the allocation to the root cache and bypass the memcg cache
- * altogether.
- *
- * There is one exception, though: the SLUB allocator does not create
- * large order caches, but rather service large kmallocs directly from
- * the page allocator. Therefore, the following sequence when backed by
- * the SLUB allocator:
- *
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
- *
- * would effectively ignore the fact that we should skip accounting,
- * since it will drive us directly to this function without passing
- * through the cache selector memcg_kmem_get_cache. Such large
- * allocations are extremely rare but can happen, for instance, for the
- * cache arrays. We bring this test here.
+ * very rare case described in mem_cgroup_from_task. Unfortunately there
+ * isn't much we can do without complicating this too much, and it would
+ * be gfp-dependent anyway. Just let it go
*/
- if (!current->mm || current->memcg_kmem_skip_account)
+ if (unlikely(!memcg))
return true;
- memcg = get_mem_cgroup_from_mm(current->mm);
-
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
return true;
@@ -3499,11 +3524,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
-static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -3521,21 +3546,16 @@ void mem_cgroup_split_huge_fixup(struct page *head)
{
struct page_cgroup *head_pc = lookup_page_cgroup(head);
struct page_cgroup *pc;
- struct mem_cgroup *memcg;
int i;
if (mem_cgroup_disabled())
return;
-
- memcg = head_pc->mem_cgroup;
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
- pc->mem_cgroup = memcg;
+ pc->mem_cgroup = head_pc->mem_cgroup;
smp_wmb();/* see __commit_charge() */
pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
- __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
- HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3565,7 +3585,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageLRU(page));
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3585,24 +3605,17 @@ static int mem_cgroup_move_account(struct page *page,
move_lock_mem_cgroup(from, &flags);
if (!anon && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
+ /* Update mapped_file data for mem_cgroup */
+ preempt_disable();
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ preempt_enable();
}
-
- mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+ mem_cgroup_charge_statistics(from, anon, -nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, page, anon, nr_pages);
+ mem_cgroup_charge_statistics(to, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -3664,7 +3677,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
}
@@ -3682,23 +3695,23 @@ out:
return ret;
}
-int mem_cgroup_charge_anon(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, enum charge_type ctype)
{
+ struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
- struct mem_cgroup *memcg;
bool oom = true;
-
- if (mem_cgroup_disabled())
- return 0;
-
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- VM_BUG_ON(!mm);
+ int ret;
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON(!PageTransHuge(page));
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -3706,14 +3719,25 @@ int mem_cgroup_charge_anon(struct page *page,
oom = false;
}
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, nr_pages,
- MEM_CGROUP_CHARGE_TYPE_ANON, false);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
+ if (ret == -ENOMEM)
+ return ret;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
return 0;
}
+int mem_cgroup_newpage_charge(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+ VM_BUG_ON(page_mapped(page));
+ VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON(!mm);
+ return mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_ANON);
+}
+
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
@@ -3725,7 +3749,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
gfp_t mask,
struct mem_cgroup **memcgp)
{
- struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *memcg;
struct page_cgroup *pc;
int ret;
@@ -3738,29 +3762,31 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
- goto out;
- if (do_swap_account)
- memcg = try_get_mem_cgroup_from_page(page);
+ return 0;
+ if (!do_swap_account)
+ goto charge_cur_mm;
+ memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
- memcg = get_mem_cgroup_from_mm(mm);
- ret = mem_cgroup_try_charge(memcg, mask, 1, true);
+ goto charge_cur_mm;
+ *memcgp = memcg;
+ ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
css_put(&memcg->css);
if (ret == -EINTR)
- memcg = root_mem_cgroup;
- else if (ret)
- return ret;
-out:
- *memcgp = memcg;
- return 0;
+ ret = 0;
+ return ret;
+charge_cur_mm:
+ ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
+ if (ret == -EINTR)
+ ret = 0;
+ return ret;
}
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
- if (mem_cgroup_disabled()) {
- *memcgp = NULL;
+ *memcgp = NULL;
+ if (mem_cgroup_disabled())
return 0;
- }
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
@@ -3768,13 +3794,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
- struct mem_cgroup *memcg;
+ int ret;
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- if (!memcg)
- return -ENOMEM;
- *memcgp = memcg;
- return 0;
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
+ if (ret == -EINTR)
+ ret = 0;
+ return ret;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
@@ -3818,11 +3843,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
MEM_CGROUP_CHARGE_TYPE_ANON);
}
-int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
+ struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- struct mem_cgroup *memcg;
int ret;
if (mem_cgroup_disabled())
@@ -3830,20 +3855,15 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
- if (PageSwapCache(page)) { /* shmem */
+ if (!PageSwapCache(page))
+ ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
+ else { /* page is swapcache/shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
- if (ret)
- return ret;
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
- return 0;
+ if (!ret)
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
}
-
- memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
- if (!memcg)
- return -ENOMEM;
- __mem_cgroup_commit_charge(memcg, page, 1, type, false);
- return 0;
+ return ret;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -3914,9 +3934,11 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (mem_cgroup_disabled())
return NULL;
+ VM_BUG_ON(PageSwapCache(page));
+
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON(!PageTransHuge(page));
}
/*
* Check if our page_cgroup is valid
@@ -3968,7 +3990,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
break;
}
- mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
+ mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -3981,12 +4003,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
unlock_page_cgroup(pc);
/*
* even after unlock, we have memcg->res.usage here and this memcg
- * will never be freed, so it's safe to call css_get().
+ * will never be freed.
*/
memcg_check_events(memcg, page);
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
mem_cgroup_swap_statistics(memcg, true);
- css_get(&memcg->css);
+ mem_cgroup_get(memcg);
}
/*
* Migration does not charge the res_counter for the
@@ -4008,19 +4030,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- /*
- * If the page is in swap cache, uncharge should be deferred
- * to the swap path, which also properly accounts swap usage
- * and handles memcg lifetime.
- *
- * Note that this check is not stable and reclaim may add the
- * page to swap cache at any time after this. However, if the
- * page is not in swap cache by the time page->mapcount hits
- * 0, there won't be any page table references to the swap
- * slot, and reclaim will free it and not actually write the
- * page to disk.
- */
+ VM_BUG_ON(page->mapping && !PageAnon(page));
if (PageSwapCache(page))
return;
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
@@ -4028,8 +4038,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping, page);
+ VM_BUG_ON(page_mapped(page));
+ VM_BUG_ON(page->mapping);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -4098,10 +4108,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
/*
* record memcg information, if swapout && memcg != NULL,
- * css_get() was called in uncharge().
+ * mem_cgroup_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, mem_cgroup_id(memcg));
+ swap_cgroup_record(ent, css_id(&memcg->css));
}
#endif
@@ -4123,13 +4133,13 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
memcg = mem_cgroup_lookup(id);
if (memcg) {
/*
- * We uncharge this because swap is freed. This memcg can
- * be obsolete one. We avoid calling css_tryget_online().
+ * We uncharge this because swap is freed.
+ * This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
+ mem_cgroup_put(memcg);
}
rcu_read_unlock();
}
@@ -4153,8 +4163,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = mem_cgroup_id(from);
- new_id = mem_cgroup_id(to);
+ old_id = css_id(&from->css);
+ new_id = css_id(&to->css);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -4163,14 +4173,11 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone css_get(to) because if
- * the process that has been moved to @to does swap-in, the
- * refcount of @to might be decreased to 0.
- *
- * We are in attach() phase, so the cgroup is guaranteed to be
- * alive, so we can just call css_get().
+ * improvement. But we cannot postpone mem_cgroup_get(to)
+ * because if the process that has been moved to @to does
+ * swap-in, the refcount of @to might be decreased to 0.
*/
- css_get(&to->css);
+ mem_cgroup_get(to);
return 0;
}
return -EINVAL;
@@ -4333,7 +4340,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
+ mem_cgroup_charge_statistics(memcg, false, -1);
ClearPageCgroupUsed(pc);
}
unlock_page_cgroup(pc);
@@ -4382,8 +4389,8 @@ void mem_cgroup_print_bad_page(struct page *page)
pc = lookup_page_cgroup_used(page);
if (pc) {
- pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
- pc, pc->flags, pc->mem_cgroup);
+ printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+ pc, pc->flags, pc->mem_cgroup);
}
}
#endif
@@ -4446,7 +4453,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
MEM_CGROUP_RECLAIM_SHRINK);
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
- if (curusage >= oldusage)
+ if (curusage >= oldusage)
retry_count--;
else
oldusage = curusage;
@@ -4467,7 +4474,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
- retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
while (retry_count) {
if (signal_pending(current)) {
@@ -4578,7 +4585,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (1);
}
- __mem_cgroup_remove_exceeded(mz, mctz);
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
@@ -4589,7 +4596,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
@@ -4656,9 +4663,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
if (mem_cgroup_move_parent(page, pc, memcg)) {
/* found lock contention or "pc" is obsolete. */
busy = page;
+ cond_resched();
} else
busy = NULL;
- cond_resched();
} while (!list_empty(list));
}
@@ -4710,30 +4717,6 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
}
/*
- * Test whether @memcg has children, dead or alive. Note that this
- * function doesn't care whether @memcg has use_hierarchy enabled and
- * returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
- */
-static inline bool memcg_has_children(struct mem_cgroup *memcg)
-{
- bool ret;
-
- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
- rcu_read_lock();
- ret = css_next_child(NULL, &memcg->css);
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Reclaims as many pages from the given memcg as possible and moves
* the rest to the parent.
*
@@ -4742,6 +4725,11 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct cgroup *cgrp = memcg->css.cgroup;
+
+ /* returns EBUSY if there is a task or if we come here twice. */
+ if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ return -EBUSY;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -4761,35 +4749,44 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
}
}
+ lru_add_drain();
+ mem_cgroup_reparent_charges(memcg);
return 0;
}
-static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
+static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ int ret;
if (mem_cgroup_is_root(memcg))
return -EINVAL;
- return mem_cgroup_force_empty(memcg) ?: nbytes;
+ css_get(&memcg->css);
+ ret = mem_cgroup_force_empty(memcg);
+ css_put(&memcg->css);
+
+ return ret;
}
-static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+
+static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
{
- return mem_cgroup_from_css(css)->use_hierarchy;
+ return mem_cgroup_from_cont(cont)->use_hierarchy;
}
-static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
+static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
+ u64 val)
{
int retval = 0;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct cgroup *parent = cont->parent;
+ struct mem_cgroup *parent_memcg = NULL;
- mutex_lock(&memcg_create_mutex);
+ if (parent)
+ parent_memcg = mem_cgroup_from_cont(parent);
+
+ cgroup_lock();
if (memcg->use_hierarchy == val)
goto out;
@@ -4804,7 +4801,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (!memcg_has_children(memcg))
+ if (list_empty(&cont->children))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -4812,7 +4809,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
retval = -EINVAL;
out:
- mutex_unlock(&memcg_create_mutex);
+ cgroup_unlock();
return retval;
}
@@ -4844,10 +4841,6 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return res_counter_read_u64(&memcg->memsw, RES_USAGE);
}
- /*
- * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
- * as well as in MEM_CGROUP_STAT_RSS_HUGE.
- */
val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
@@ -4857,17 +4850,22 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ char str[64];
u64 val;
- int name;
+ int name, len;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
+
switch (type) {
case _MEM:
if (name == RES_USAGE)
@@ -4888,26 +4886,17 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
BUG();
}
- return val;
+ len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+ return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
-#ifdef CONFIG_MEMCG_KMEM
-/* should be called with activate_kmem_mutex held */
-static int __memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long long limit)
+static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
{
- int err = 0;
- int memcg_id;
-
- if (memcg_kmem_is_active(memcg))
- return 0;
-
- /*
- * We are going to allocate memory for data shared by all memory
- * cgroups so let's stop accounting here.
- */
- memcg_stop_kmem_account();
+ int ret = -EINVAL;
+#ifdef CONFIG_MEMCG_KMEM
+ bool must_inc_static_branch = false;
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -4919,78 +4908,64 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
*
* After it first became limited, changes in the value of the limit are
* of course permitted.
+ *
+ * Taking the cgroup_lock is really offensive, but it is so far the only
+ * way to guarantee that no children will appear. There are plenty of
+ * other offenders, and they should all go away. Fine grained locking
+ * is probably the way to go here. When we are fully hierarchical, we
+ * can also get rid of the use_hierarchy check.
*/
- mutex_lock(&memcg_create_mutex);
- if (cgroup_has_tasks(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
-
- memcg_id = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
-
- /*
- * Make sure we have enough space for this cgroup in each root cache's
- * memcg_params.
- */
- mutex_lock(&memcg_slab_mutex);
- err = memcg_update_all_caches(memcg_id + 1);
- mutex_unlock(&memcg_slab_mutex);
- if (err)
- goto out_rmid;
-
- memcg->kmemcg_id = memcg_id;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ cgroup_lock();
+ mutex_lock(&set_limit_mutex);
+ if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+ if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
+ !list_empty(&cont->children))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ ret = res_counter_set_limit(&memcg->kmem, val);
+ VM_BUG_ON(ret);
- /*
- * We couldn't have accounted to this cgroup, because it hasn't got the
- * active bit set yet, so this should succeed.
- */
- err = res_counter_set_limit(&memcg->kmem, limit);
- VM_BUG_ON(err);
+ ret = memcg_update_cache_sizes(memcg);
+ if (ret) {
+ res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+ goto out;
+ }
+ must_inc_static_branch = true;
+ /*
+ * kmem charges can outlive the cgroup. In the case of slab
+ * pages, for instance, a page contain objects from various
+ * processes, so it is unfeasible to migrate them away. We
+ * need to reference count the memcg because of that.
+ */
+ mem_cgroup_get(memcg);
+ } else
+ ret = res_counter_set_limit(&memcg->kmem, val);
+out:
+ mutex_unlock(&set_limit_mutex);
+ cgroup_unlock();
- static_key_slow_inc(&memcg_kmem_enabled_key);
/*
- * Setting the active bit after enabling static branching will
- * guarantee no one starts accounting before all call sites are
- * patched.
+ * We are by now familiar with the fact that we can't inc the static
+ * branch inside cgroup_lock. See disarm functions for details. A
+ * worker here is overkill, but also wrong: After the limit is set, we
+ * must start accounting right away. Since this operation can't fail,
+ * we can safely defer it to here - no rollback will be needed.
+ *
+ * The boolean used to control this is also safe, because
+ * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
+ * able to set it to true;
*/
- memcg_kmem_set_active(memcg);
-out:
- memcg_resume_kmem_account();
- return err;
-
-out_rmid:
- ida_simple_remove(&kmem_limited_groups, memcg_id);
- goto out;
-}
-
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long long limit)
-{
- int ret;
-
- mutex_lock(&activate_kmem_mutex);
- ret = __memcg_activate_kmem(memcg, limit);
- mutex_unlock(&activate_kmem_mutex);
- return ret;
-}
-
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long long val)
-{
- int ret;
+ if (must_inc_static_branch) {
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * setting the active bit after the inc will guarantee no one
+ * starts accounting before all call sites are patched
+ */
+ memcg_kmem_set_active(memcg);
+ }
- if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, val);
- else
- ret = res_counter_set_limit(&memcg->kmem, val);
+#endif
return ret;
}
@@ -4998,44 +4973,59 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-
if (!parent)
- return 0;
+ goto out;
+
+ memcg->kmem_account_flags = parent->kmem_account_flags;
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * When that happen, we need to disable the static branch only on those
+ * memcgs that enabled it. To achieve this, we would be forced to
+ * complicate the code by keeping track of which memcgs were the ones
+ * that actually enabled limits, and which ones got it from its
+ * parents.
+ *
+ * It is a lot simpler just to do static_key_slow_inc() on every child
+ * that is accounted.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out;
- mutex_lock(&activate_kmem_mutex);
/*
- * If the parent cgroup is not kmem-active now, it cannot be activated
- * after this point, because it has at least one child already.
+ * destroy(), called if we fail, will issue static_key_slow_inc() and
+ * mem_cgroup_put() if kmem is enabled. We have to either call them
+ * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+ * this more consistent, since it always leads to the same destroy path
*/
- if (memcg_kmem_is_active(parent))
- ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
- mutex_unlock(&activate_kmem_mutex);
+ mem_cgroup_get(memcg);
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+
+ mutex_lock(&set_limit_mutex);
+ ret = memcg_update_cache_sizes(memcg);
+ mutex_unlock(&set_limit_mutex);
+#endif
+out:
return ret;
}
-#else
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long long val)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
* RES_LIMIT.
*/
-static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+ const char *buffer)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
enum res_type type;
int name;
unsigned long long val;
int ret;
- buf = strstrip(buf);
- type = MEMFILE_TYPE(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ type = MEMFILE_TYPE(cft->private);
+ name = MEMFILE_ATTR(cft->private);
+
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
switch (name) {
case RES_LIMIT:
@@ -5044,7 +5034,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
break;
}
/* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buf, &val);
+ ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
if (type == _MEM)
@@ -5052,12 +5042,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(memcg, val);
+ ret = memcg_update_kmem_limit(cont, val);
else
return -EINVAL;
break;
case RES_SOFT_LIMIT:
- ret = res_counter_memparse_write_strategy(buf, &val);
+ ret = res_counter_memparse_write_strategy(buffer, &val);
if (ret)
break;
/*
@@ -5074,21 +5064,24 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
ret = -EINVAL; /* should be BUG() ? */
break;
}
- return ret ?: nbytes;
+ return ret;
}
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
unsigned long long *mem_limit, unsigned long long *memsw_limit)
{
+ struct cgroup *cgroup;
unsigned long long min_limit, min_memsw_limit, tmp;
min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ cgroup = memcg->css.cgroup;
if (!memcg->use_hierarchy)
goto out;
- while (memcg->css.parent) {
- memcg = mem_cgroup_from_css(memcg->css.parent);
+ while (cgroup->parent) {
+ cgroup = cgroup->parent;
+ memcg = mem_cgroup_from_cont(cgroup);
if (!memcg->use_hierarchy)
break;
tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -5101,15 +5094,17 @@ out:
*memsw_limit = min_memsw_limit;
}
-static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off)
+static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
int name;
enum res_type type;
- type = MEMFILE_TYPE(of_cft(of)->private);
- name = MEMFILE_ATTR(of_cft(of)->private);
+ type = MEMFILE_TYPE(event);
+ name = MEMFILE_ATTR(event);
+
+ if (!do_swap_account && type == _MEMSWAP)
+ return -EOPNOTSUPP;
switch (name) {
case RES_MAX_USAGE:
@@ -5134,35 +5129,36 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
break;
}
- return nbytes;
+ return 0;
}
-static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
+static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
struct cftype *cft)
{
- return mem_cgroup_from_css(css)->move_charge_at_immigrate;
+ return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
}
#ifdef CONFIG_MMU
-static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
if (val >= (1 << NR_MOVE_TYPE))
return -EINVAL;
-
/*
- * No kind of locking is needed in here, because ->can_attach() will
- * check this value once in the beginning of the process, and then carry
- * on with stale data. This means that changes to this value will only
- * affect task migrations starting after the change.
+ * We check this value several times in both in can_attach() and
+ * attach(), so we need cgroup lock to prevent this value from being
+ * inconsistent.
*/
+ cgroup_lock();
memcg->move_charge_at_immigrate = val;
+ cgroup_unlock();
+
return 0;
}
#else
-static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
return -ENOSYS;
@@ -5170,64 +5166,69 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct seq_file *m, void *v)
+static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
{
- struct numa_stat {
- const char *name;
- unsigned int lru_mask;
- };
-
- static const struct numa_stat stats[] = {
- { "total", LRU_ALL },
- { "file", LRU_ALL_FILE },
- { "anon", LRU_ALL_ANON },
- { "unevictable", BIT(LRU_UNEVICTABLE) },
- };
- const struct numa_stat *stat;
int nid;
- unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-
- for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
- seq_putc(m, '\n');
+ unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
+ unsigned long node_nr;
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+
+ total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
+ seq_printf(m, "total=%lu", total_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
}
+ seq_putc(m, '\n');
- for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
- seq_putc(m, '\n');
+ file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
+ seq_printf(m, "file=%lu", file_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_FILE);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
}
+ seq_putc(m, '\n');
+ anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
+ seq_printf(m, "anon=%lu", anon_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_ANON);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
+ seq_printf(m, "unevictable=%lu", unevictable_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ BIT(LRU_UNEVICTABLE));
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
+static const char * const mem_cgroup_lru_names[] = {
+ "inactive_anon",
+ "active_anon",
+ "inactive_file",
+ "active_file",
+ "unevictable",
+};
+
static inline void mem_cgroup_lru_names_not_uptodate(void)
{
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int memcg_stat_show(struct seq_file *m, void *v)
+static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct mem_cgroup *mi;
unsigned int i;
@@ -5293,7 +5294,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
rstat = &mz->lruvec.reclaim_stat;
recent_rotated[0] += rstat->recent_rotated[0];
@@ -5311,26 +5312,39 @@ static int memcg_stat_show(struct seq_file *m, void *v)
return 0;
}
-static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
return mem_cgroup_swappiness(memcg);
}
-static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
+static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent;
if (val > 100)
return -EINVAL;
- if (css->parent)
- memcg->swappiness = val;
- else
- vm_swappiness = val;
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+
+ parent = mem_cgroup_from_cont(cgrp->parent);
+
+ cgroup_lock();
+
+ /* If under hierarchy, only empty-root can set this value */
+ if ((parent->use_hierarchy) ||
+ (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+ cgroup_unlock();
+ return -EINVAL;
+ }
+
+ memcg->swappiness = val;
+
+ cgroup_unlock();
return 0;
}
@@ -5402,13 +5416,7 @@ static int compare_thresholds(const void *a, const void *b)
const struct mem_cgroup_threshold *_a = a;
const struct mem_cgroup_threshold *_b = b;
- if (_a->threshold > _b->threshold)
- return 1;
-
- if (_a->threshold < _b->threshold)
- return -1;
-
- return 0;
+ return _a->threshold - _b->threshold;
}
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
@@ -5428,11 +5436,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
-static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -5509,23 +5519,13 @@ unlock:
return ret;
}
-static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
-}
-
-static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
-}
-
-static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, enum res_type type)
+static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -5598,23 +5598,14 @@ unlock:
mutex_unlock(&memcg->thresholds_lock);
}
-static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
-}
-
-static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
-}
-
-static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
+static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
+ enum res_type type = MEMFILE_TYPE(cft->private);
+ BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -5632,10 +5623,14 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
return 0;
}
-static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
+ enum res_type type = MEMFILE_TYPE(cft->private);
+
+ BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
@@ -5649,28 +5644,43 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
spin_unlock(&memcg_oom_lock);
}
-static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+ struct cftype *cft, struct cgroup_map_cb *cb)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
- seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
- seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+ cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
+
+ if (atomic_read(&memcg->under_oom))
+ cb->fill(cb, "under_oom", 1);
+ else
+ cb->fill(cb, "under_oom", 0);
return 0;
}
-static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent;
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!css->parent || !((val == 0) || (val == 1)))
+ if (!cgrp->parent || !((val == 0) || (val == 1)))
return -EINVAL;
+ parent = mem_cgroup_from_cont(cgrp->parent);
+
+ cgroup_lock();
+ /* oom-kill-disable is a flag for subhierarchy. */
+ if ((parent->use_hierarchy) ||
+ (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+ cgroup_unlock();
+ return -EINVAL;
+ }
memcg->oom_kill_disable = val;
if (!val)
memcg_oom_recover(memcg);
-
+ cgroup_unlock();
return 0;
}
@@ -5685,45 +5695,25 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return ret;
return mem_cgroup_sockets_init(memcg, ss);
-}
+};
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
mem_cgroup_sockets_destroy(memcg);
-}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
-{
- if (!memcg_kmem_is_active(memcg))
- return;
-
- /*
- * kmem charges can outlive the cgroup. In the case of slab
- * pages, for instance, a page contain objects from various
- * processes. As we prevent from taking a reference for every
- * such allocation we have to be careful when doing uncharge
- * (see memcg_uncharge_kmem) and here during offlining.
- *
- * The idea is that that only the _last_ uncharge which sees
- * the dead memcg will drop the last reference. An additional
- * reference is taken here before the group is marked dead
- * which is then paired with css_put during uncharge resp. here.
- *
- * Although this might sound strange as this path is called from
- * css_offline() when the referencemight have dropped down to 0 and
- * shouldn't be incremented anymore (css_tryget_online() would
- * fail) we do not have other options because of the kmem
- * allocations lifetime.
- */
- css_get(&memcg->css);
memcg_kmem_mark_dead(memcg);
if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
return;
+ /*
+ * Charges already down to 0, undo mem_cgroup_get() done in the charge
+ * path here, being careful not to race with memcg_uncharge_kmem: it is
+ * possible that the charges went down to 0 between mark_dead and the
+ * res_counter read, so in that case, we don't need the put
+ */
if (memcg_kmem_test_and_clear_dead(memcg))
- css_put(&memcg->css);
+ mem_cgroup_put(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5731,289 +5721,57 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
-}
-
-static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
}
#endif
-/*
- * DO NOT USE IN NEW FILES.
- *
- * "cgroup.event_control" implementation.
- *
- * This is way over-engineered. It tries to support fully configurable
- * events for each user. Such level of flexibility is completely
- * unnecessary especially in the light of the planned unified hierarchy.
- *
- * Please deprecate this and replace with something simpler if at all
- * possible.
- */
-
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void memcg_event_remove(struct work_struct *work)
-{
- struct mem_cgroup_event *event =
- container_of(work, struct mem_cgroup_event, remove);
- struct mem_cgroup *memcg = event->memcg;
-
- remove_wait_queue(event->wqh, &event->wait);
-
- event->unregister_event(memcg, event->eventfd);
-
- /* Notify userspace the event is going away. */
- eventfd_signal(event->eventfd, 1);
-
- eventfd_ctx_put(event->eventfd);
- kfree(event);
- css_put(&memcg->css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
-{
- struct mem_cgroup_event *event =
- container_of(wait, struct mem_cgroup_event, wait);
- struct mem_cgroup *memcg = event->memcg;
- unsigned long flags = (unsigned long)key;
-
- if (flags & POLLHUP) {
- /*
- * If the event has been detached at cgroup removal, we
- * can simply return knowing the other side will cleanup
- * for us.
- *
- * We can't race against event freeing since the other
- * side will require wqh->lock via remove_wait_queue(),
- * which we hold.
- */
- spin_lock(&memcg->event_list_lock);
- if (!list_empty(&event->list)) {
- list_del_init(&event->list);
- /*
- * We are in atomic context, but cgroup_event_remove()
- * may sleep, so we have to call it in workqueue.
- */
- schedule_work(&event->remove);
- }
- spin_unlock(&memcg->event_list_lock);
- }
-
- return 0;
-}
-
-static void memcg_event_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
-{
- struct mem_cgroup_event *event =
- container_of(pt, struct mem_cgroup_event, pt);
-
- event->wqh = wqh;
- add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * DO NOT USE IN NEW FILES.
- *
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup_subsys_state *css = of_css(of);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup_event *event;
- struct cgroup_subsys_state *cfile_css;
- unsigned int efd, cfd;
- struct fd efile;
- struct fd cfile;
- const char *name;
- char *endp;
- int ret;
-
- buf = strstrip(buf);
-
- efd = simple_strtoul(buf, &endp, 10);
- if (*endp != ' ')
- return -EINVAL;
- buf = endp + 1;
-
- cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
- return -EINVAL;
- buf = endp + 1;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- event->memcg = memcg;
- INIT_LIST_HEAD(&event->list);
- init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
- init_waitqueue_func_entry(&event->wait, memcg_event_wake);
- INIT_WORK(&event->remove, memcg_event_remove);
-
- efile = fdget(efd);
- if (!efile.file) {
- ret = -EBADF;
- goto out_kfree;
- }
-
- event->eventfd = eventfd_ctx_fileget(efile.file);
- if (IS_ERR(event->eventfd)) {
- ret = PTR_ERR(event->eventfd);
- goto out_put_efile;
- }
-
- cfile = fdget(cfd);
- if (!cfile.file) {
- ret = -EBADF;
- goto out_put_eventfd;
- }
-
- /* the process need read permission on control file */
- /* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile.file), MAY_READ);
- if (ret < 0)
- goto out_put_cfile;
-
- /*
- * Determine the event callbacks and set them in @event. This used
- * to be done via struct cftype but cgroup core no longer knows
- * about these events. The following is crude but the whole thing
- * is for compatibility anyway.
- *
- * DO NOT ADD NEW FILES.
- */
- name = cfile.file->f_dentry->d_name.name;
-
- if (!strcmp(name, "memory.usage_in_bytes")) {
- event->register_event = mem_cgroup_usage_register_event;
- event->unregister_event = mem_cgroup_usage_unregister_event;
- } else if (!strcmp(name, "memory.oom_control")) {
- event->register_event = mem_cgroup_oom_register_event;
- event->unregister_event = mem_cgroup_oom_unregister_event;
- } else if (!strcmp(name, "memory.pressure_level")) {
- event->register_event = vmpressure_register_event;
- event->unregister_event = vmpressure_unregister_event;
- } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
- event->register_event = memsw_cgroup_usage_register_event;
- event->unregister_event = memsw_cgroup_usage_unregister_event;
- } else {
- ret = -EINVAL;
- goto out_put_cfile;
- }
-
- /*
- * Verify @cfile should belong to @css. Also, remaining events are
- * automatically removed on cgroup destruction but the removal is
- * asynchronous, so take an extra ref on @css.
- */
- cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
- &memory_cgrp_subsys);
- ret = -EINVAL;
- if (IS_ERR(cfile_css))
- goto out_put_cfile;
- if (cfile_css != css) {
- css_put(cfile_css);
- goto out_put_cfile;
- }
-
- ret = event->register_event(memcg, event->eventfd, buf);
- if (ret)
- goto out_put_css;
-
- efile.file->f_op->poll(efile.file, &event->pt);
-
- spin_lock(&memcg->event_list_lock);
- list_add(&event->list, &memcg->event_list);
- spin_unlock(&memcg->event_list_lock);
-
- fdput(cfile);
- fdput(efile);
-
- return nbytes;
-
-out_put_css:
- css_put(css);
-out_put_cfile:
- fdput(cfile);
-out_put_eventfd:
- eventfd_ctx_put(event->eventfd);
-out_put_efile:
- fdput(efile);
-out_kfree:
- kfree(event);
-
- return ret;
-}
-
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
+ .read = mem_cgroup_read,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
},
{
.name = "stat",
- .seq_show = memcg_stat_show,
+ .read_seq_string = memcg_stat_show,
},
{
.name = "force_empty",
- .write = mem_cgroup_force_empty_write,
+ .trigger = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
- .flags = CFTYPE_INSANE,
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
- .name = "cgroup.event_control", /* XXX: for compat */
- .write = memcg_write_event_control,
- .flags = CFTYPE_NO_PREFIX,
- .mode = S_IWUGO,
- },
- {
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
@@ -6025,81 +5783,79 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
- .seq_show = mem_cgroup_oom_control_read,
+ .read_map = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
+ .register_event = mem_cgroup_oom_register_event,
+ .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
- {
- .name = "pressure_level",
- },
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .seq_show = memcg_numa_stat_show,
+ .read_seq_string = memcg_numa_stat_show,
+ },
+#endif
+#ifdef CONFIG_MEMCG_SWAP
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read = mem_cgroup_read,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
+ .read = mem_cgroup_read,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .seq_show = mem_cgroup_slabinfo_read,
+ .read_seq_string = mem_cgroup_slabinfo_read,
},
#endif
#endif
{ }, /* terminate */
};
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
- {
- .name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- { }, /* terminate */
-};
-#endif
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -6126,24 +5882,26 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
mz->on_tree = false;
mz->memcg = memcg;
}
- memcg->nodeinfo[node] = pn;
+ memcg->info.nodeinfo[node] = pn;
return 0;
}
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
- kfree(memcg->nodeinfo[node]);
+ kfree(memcg->info.nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ int size = sizeof(struct mem_cgroup);
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ /* Can be very big if MAX_NUMNODES is very big */
+ if (size < PAGE_SIZE)
+ memcg = kzalloc(size, GFP_KERNEL);
+ else
+ memcg = vzalloc(size);
- memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -6154,7 +5912,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- kfree(memcg);
+ if (size < PAGE_SIZE)
+ kfree(memcg);
+ else
+ vfree(memcg);
return NULL;
}
@@ -6172,8 +5933,10 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
+ int size = sizeof(struct mem_cgroup);
mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -6192,7 +5955,53 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- kfree(memcg);
+ if (size < PAGE_SIZE)
+ kfree(memcg);
+ else
+ vfree(memcg);
+}
+
+
+/*
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
+ * but in process context. The work_freeing structure is overlaid
+ * on the rcu_freeing structure, which itself is overlaid on memsw.
+ */
+static void free_work(struct work_struct *work)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(work, struct mem_cgroup, work_freeing);
+ __mem_cgroup_free(memcg);
+}
+
+static void free_rcu(struct rcu_head *rcu_head)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+ INIT_WORK(&memcg->work_freeing, free_work);
+ schedule_work(&memcg->work_freeing);
+}
+
+static void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg->refcnt);
+}
+
+static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
+{
+ if (atomic_sub_and_test(count, &memcg->refcnt)) {
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
+ if (parent)
+ mem_cgroup_put(parent);
+ }
+}
+
+static void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+ __mem_cgroup_put(memcg, 1);
}
/*
@@ -6206,7 +6015,19 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
+#ifdef CONFIG_MEMCG_SWAP
+static void __init enable_swap_cgroup(void)
+{
+ if (!mem_cgroup_disabled() && really_do_swap_account)
+ do_swap_account = 1;
+}
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
+#endif
+
+static int mem_cgroup_soft_limit_tree_init(void)
{
struct mem_cgroup_tree_per_node *rtpn;
struct mem_cgroup_tree_per_zone *rtpz;
@@ -6217,7 +6038,8 @@ static void __init mem_cgroup_soft_limit_tree_init(void)
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- BUG_ON(!rtpn);
+ if (!rtpn)
+ goto err_cleanup;
soft_limit_tree.rb_tree_per_node[node] = rtpn;
@@ -6227,12 +6049,23 @@ static void __init mem_cgroup_soft_limit_tree_init(void)
spin_lock_init(&rtpz->lock);
}
}
+ return 0;
+
+err_cleanup:
+ for_each_node(node) {
+ if (!soft_limit_tree.rb_tree_per_node[node])
+ break;
+ kfree(soft_limit_tree.rb_tree_per_node[node]);
+ soft_limit_tree.rb_tree_per_node[node] = NULL;
+ }
+ return 1;
+
}
static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+mem_cgroup_css_alloc(struct cgroup *cont)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
int node;
@@ -6245,56 +6078,36 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
goto free_out;
/* root ? */
- if (parent_css == NULL) {
+ if (cont->parent == NULL) {
+ int cpu;
+ enable_swap_cgroup();
+ parent = NULL;
+ if (mem_cgroup_soft_limit_tree_init())
+ goto free_out;
root_mem_cgroup = memcg;
- res_counter_init(&memcg->res, NULL);
- res_counter_init(&memcg->memsw, NULL);
- res_counter_init(&memcg->kmem, NULL);
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock =
+ &per_cpu(memcg_stock, cpu);
+ INIT_WORK(&stock->work, drain_local_stock);
+ }
+ } else {
+ parent = mem_cgroup_from_cont(cont->parent);
+ memcg->use_hierarchy = parent->use_hierarchy;
+ memcg->oom_kill_disable = parent->oom_kill_disable;
}
- memcg->last_scanned_node = MAX_NUMNODES;
- INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
- mutex_init(&memcg->thresholds_lock);
- spin_lock_init(&memcg->move_lock);
- vmpressure_init(&memcg->vmpressure);
- INIT_LIST_HEAD(&memcg->event_list);
- spin_lock_init(&memcg->event_list_lock);
-
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
-}
-
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
- if (!parent)
- return 0;
-
- mutex_lock(&memcg_create_mutex);
-
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
-
- if (parent->use_hierarchy) {
+ if (parent && parent->use_hierarchy) {
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
res_counter_init(&memcg->kmem, &parent->kmem);
/*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
+ * We increment refcnt of the parent to ensure that we can
+ * safely access it on res_counter_charge/uncharge.
+ * This refcnt will be decremented when freeing this
+ * mem_cgroup(see mem_cgroup_put).
*/
+ mem_cgroup_get(parent);
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
@@ -6304,107 +6117,50 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
* much sense so let cgroup subsystem know about this
* unfortunate state in our controller.
*/
- if (parent != root_mem_cgroup)
- memory_cgrp_subsys.broken_hierarchy = true;
+ if (parent && parent != root_mem_cgroup)
+ mem_cgroup_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);
-
- return memcg_init_kmem(memcg, &memory_cgrp_subsys);
-}
-
-/*
- * Announce all parents that a group from their hierarchy is gone.
- */
-static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
-{
- struct mem_cgroup *parent = memcg;
+ memcg->last_scanned_node = MAX_NUMNODES;
+ INIT_LIST_HEAD(&memcg->oom_notify);
- while ((parent = parent_mem_cgroup(parent)))
- mem_cgroup_iter_invalidate(parent);
+ if (parent)
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ atomic_set(&memcg->refcnt, 1);
+ memcg->move_charge_at_immigrate = 0;
+ mutex_init(&memcg->thresholds_lock);
+ spin_lock_init(&memcg->move_lock);
- /*
- * if the root memcg is not hierarchical we have to check it
- * explicitely.
- */
- if (!root_mem_cgroup->use_hierarchy)
- mem_cgroup_iter_invalidate(root_mem_cgroup);
+ error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+ if (error) {
+ /*
+ * We call put now because our (and parent's) refcnts
+ * are already in place. mem_cgroup_put() will internally
+ * call __mem_cgroup_free, so return directly
+ */
+ mem_cgroup_put(memcg);
+ return ERR_PTR(error);
+ }
+ return &memcg->css;
+free_out:
+ __mem_cgroup_free(memcg);
+ return ERR_PTR(error);
}
-static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
+static void mem_cgroup_css_offline(struct cgroup *cont)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup_event *event, *tmp;
- struct cgroup_subsys_state *iter;
-
- /*
- * Unregister events and notify userspace.
- * Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace.
- */
- spin_lock(&memcg->event_list_lock);
- list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
- list_del_init(&event->list);
- schedule_work(&event->remove);
- }
- spin_unlock(&memcg->event_list_lock);
-
- kmem_cgroup_css_offline(memcg);
-
- mem_cgroup_invalidate_reclaim_iterators(memcg);
-
- /*
- * This requires that offlining is serialized. Right now that is
- * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
- */
- css_for_each_descendant_post(iter, css)
- mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- memcg_unregister_all_caches(memcg);
- vmpressure_cleanup(&memcg->vmpressure);
+ mem_cgroup_reparent_charges(memcg);
+ mem_cgroup_destroy_all_caches(memcg);
}
-static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+static void mem_cgroup_css_free(struct cgroup *cont)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- /*
- * XXX: css_offline() would be where we should reparent all
- * memory to prepare the cgroup for destruction. However,
- * memcg does not do css_tryget_online() and res_counter charging
- * under the same RCU lock region, which means that charging
- * could race with offlining. Offlining only happens to
- * cgroups with no tasks in them but charges can show up
- * without any tasks from the swapin path when the target
- * memcg is looked up from the swapout record and not from the
- * current task as it usually is. A race like this can leak
- * charges and put pages with stale cgroup pointers into
- * circulation:
- *
- * #0 #1
- * lookup_swap_cgroup_id()
- * rcu_read_lock()
- * mem_cgroup_lookup()
- * css_tryget_online()
- * rcu_read_unlock()
- * disable css_tryget_online()
- * call_rcu()
- * offline_css()
- * reparent_charges()
- * res_counter_charge()
- * css_put()
- * css_free()
- * pc->mem_cgroup = dead memcg
- * add page to lru
- *
- * The bulk of the charges are still moved in offline_css() to
- * avoid pinning a lot of pages in case a long-term reference
- * like a swapout record is deferring the css_free() to long
- * after offlining. But this makes sure we catch any charges
- * made after offlining:
- */
- mem_cgroup_reparent_charges(memcg);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- memcg_destroy_kmem(memcg);
- __mem_cgroup_free(memcg);
+ kmem_cgroup_destroy(memcg);
+
+ mem_cgroup_put(memcg);
}
#ifdef CONFIG_MMU
@@ -6451,7 +6207,8 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
+ ret = __mem_cgroup_try_charge(NULL,
+ GFP_KERNEL, 1, &memcg, false);
if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
@@ -6522,7 +6279,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
- page = find_get_page(swap_address_space(ent), ent.val);
+ page = find_get_page(&swapper_space, ent.val);
if (do_swap_account)
entry->val = ent.val;
@@ -6555,20 +6312,16 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
+ page = find_get_page(mapping, pgoff);
+
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- if (do_swap_account)
- *entry = swp;
- page = find_get_page(swap_address_space(swp), swp.val);
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
+ if (do_swap_account)
+ *entry = swap;
+ page = find_get_page(&swapper_space, swap.val);
+ }
#endif
return page;
}
@@ -6607,7 +6360,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
+ css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6629,7 +6382,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+ VM_BUG_ON(!page || !PageHead(page));
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
@@ -6658,10 +6411,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -6716,7 +6469,6 @@ static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
- int i;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
@@ -6737,9 +6489,7 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
-
- for (i = 0; i < mc.moved_swap; i++)
- css_put(&mc.from->css);
+ __mem_cgroup_put(mc.from, mc.moved_swap);
if (!mem_cgroup_is_root(mc.to)) {
/*
@@ -6749,7 +6499,7 @@ static void __mem_cgroup_clear_mc(void)
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
- /* we've already done css_get(mc.to) */
+ /* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
@@ -6774,21 +6524,14 @@ static void mem_cgroup_clear_mc(void)
mem_cgroup_end_move(from);
}
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
+static int mem_cgroup_can_attach(struct cgroup *cgroup,
struct cgroup_taskset *tset)
{
struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- unsigned long move_charge_at_immigrate;
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
- /*
- * We are now commited to this value whatever it is. Changes in this
- * tunable will only affect upcoming migrations, not the current one.
- * So we need to save it, and keep it going.
- */
- move_charge_at_immigrate = memcg->move_charge_at_immigrate;
- if (move_charge_at_immigrate) {
+ if (memcg->move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -6808,7 +6551,6 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
- mc.immigrate_flags = move_charge_at_immigrate;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -6821,7 +6563,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
return ret;
}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
+static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
struct cgroup_taskset *tset)
{
mem_cgroup_clear_mc();
@@ -6850,9 +6592,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6869,7 +6611,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -6969,7 +6711,7 @@ retry:
up_read(&mm->mmap_sem);
}
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
+static void mem_cgroup_move_task(struct cgroup *cont,
struct cgroup_taskset *tset)
{
struct task_struct *p = cgroup_taskset_first(tset);
@@ -6984,52 +6726,52 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
+static int mem_cgroup_can_attach(struct cgroup *cgroup,
struct cgroup_taskset *tset)
{
return 0;
}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
+static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
struct cgroup_taskset *tset)
{
}
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
+static void mem_cgroup_move_task(struct cgroup *cont,
struct cgroup_taskset *tset)
{
}
#endif
-/*
- * Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify sane_behavior flag on each mount attempt.
- */
-static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
-{
- /*
- * use_hierarchy is forced with sane_behavior. cgroup core
- * guarantees that @root doesn't have any children, so turning it
- * on for the root memcg is enough.
- */
- if (cgroup_sane_behavior(root_css->cgroup))
- mem_cgroup_from_css(root_css)->use_hierarchy = true;
-}
-
-struct cgroup_subsys memory_cgrp_subsys = {
+struct cgroup_subsys mem_cgroup_subsys = {
+ .name = "memory",
+ .subsys_id = mem_cgroup_subsys_id,
.css_alloc = mem_cgroup_css_alloc,
- .css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
.css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
+ .use_id = 1,
};
+/*
+ * The rest of init is performed during ->css_alloc() for root css which
+ * happens before initcalls. hotcpu_notifier() can't be done together as
+ * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
+ * dependency. Do it from a subsys_initcall().
+ */
+static int __init mem_cgroup_init(void)
+{
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
+
#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
+ /* consider enabled if no parameter or 1 is given */
if (!strcmp(s, "1"))
really_do_swap_account = 1;
else if (!strcmp(s, "0"))
@@ -7038,39 +6780,4 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
-static void __init memsw_file_init(void)
-{
- WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
-}
-
-static void __init enable_swap_cgroup(void)
-{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- memsw_file_init();
- }
-}
-
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
#endif
-
-/*
- * subsys_initcall() for memory controller.
- *
- * Some parts like hotcpu_notifier() have to be initialized from this context
- * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
- * everything that doesn't depend on a specific mem_cgroup structure should
- * be initialized from here.
- */
-static int __init mem_cgroup_init(void)
-{
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
- enable_swap_cgroup();
- mem_cgroup_soft_limit_tree_init();
- memcg_stock_init();
- return 0;
-}
-subsys_initcall(mem_cgroup_init);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6399e32893..c6e4dd3e1c0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
int sysctl_memory_failure_recovery __read_mostly = 1;
-atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
@@ -145,10 +145,14 @@ static int hwpoison_filter_task(struct page *p)
return -EINVAL;
css = mem_cgroup_css(mem);
- ino = cgroup_ino(css->cgroup);
+ /* root_mem_cgroup has NULL dentries */
+ if (!css->cgroup->dentry)
+ return -EINVAL;
+
+ ino = css->cgroup->dentry->d_inode->i_ino;
css_put(css);
- if (!ino || ino != hwpoison_filter_memcg)
+ if (ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
@@ -202,11 +206,11 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
+ if ((flags & MF_ACTION_REQUIRED) && t == current) {
si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, current);
+ ret = force_sig_info(SIGBUS, &si, t);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -244,12 +248,10 @@ void shake_page(struct page *p, int access)
*/
if (access) {
int nr;
- int nid = page_to_nid(p);
do {
struct shrink_control shrink = {
.gfp_mask = GFP_KERNEL,
};
- node_set(nid, shrink.nodes_to_scan);
nr = shrink_slab(&shrink, 1000, 1000);
if (page_count(p) == 1)
@@ -380,51 +382,20 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
}
}
-/*
- * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
- * on behalf of the thread group. Return task_struct of the (first found)
- * dedicated thread if found, and return NULL otherwise.
- *
- * We already hold read_lock(&tasklist_lock) in the caller, so we don't
- * have to call rcu_read_lock/unlock() in this function.
- */
-static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+static int task_early_kill(struct task_struct *tsk)
{
- struct task_struct *t;
-
- for_each_thread(tsk, t)
- if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
- return t;
- return NULL;
-}
-
-/*
- * Determine whether a given process is "early kill" process which expects
- * to be signaled when some page under the process is hwpoisoned.
- * Return task_struct of the dedicated thread (main thread unless explicitly
- * specified) if the process is "early kill," and otherwise returns NULL.
- */
-static struct task_struct *task_early_kill(struct task_struct *tsk,
- int force_early)
-{
- struct task_struct *t;
if (!tsk->mm)
- return NULL;
- if (force_early)
- return tsk;
- t = find_early_kill_thread(tsk);
- if (t)
- return t;
- if (sysctl_memory_failure_early_kill)
- return tsk;
- return NULL;
+ return 0;
+ if (tsk->flags & PF_MCE_PROCESS)
+ return !!(tsk->flags & PF_MCE_EARLY);
+ return sysctl_memory_failure_early_kill;
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ struct to_kill **tkc)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -439,17 +410,16 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
- struct task_struct *t = task_early_kill(tsk, force_early);
- if (!t)
+ if (!task_early_kill(tsk))
continue;
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ if (vma->vm_mm == tsk->mm)
+ add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -460,7 +430,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc, int force_early)
+ struct to_kill **tkc)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -470,10 +440,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
for_each_process(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct task_struct *t = task_early_kill(tsk, force_early);
- if (!t)
+ if (!task_early_kill(tsk))
continue;
+
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
@@ -483,8 +453,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill, tkc);
+ if (vma->vm_mm == tsk->mm)
+ add_to_kill(tsk, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -497,8 +467,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill,
- int force_early)
+static void collect_procs(struct page *page, struct list_head *tokill)
{
struct to_kill *tk;
@@ -509,9 +478,9 @@ static void collect_procs(struct page *page, struct list_head *tokill,
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk, force_early);
+ collect_procs_anon(page, tokill, &tk);
else
- collect_procs_file(page, tokill, &tk, force_early);
+ collect_procs_file(page, tokill, &tk);
kfree(tk);
}
@@ -640,7 +609,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty pagecache page
+ * Dirty cache page page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -815,11 +784,11 @@ static struct page_state {
{ sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
{ sc|dirty, sc, "clean swapcache", me_swapcache_clean },
- { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
-
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
+
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
{ lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
@@ -885,17 +854,17 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags, struct page **hpagep)
+ int trapno, int flags)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = *hpagep;
+ struct page *hpage = compound_head(p);
struct page *ppage;
- if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
+ if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
/*
@@ -967,21 +936,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
- /*
- * We pinned the head page for hwpoison handling,
- * now we split the thp and we are interested in
- * the hwpoisoned raw page, so move the refcount
- * to it. Similarly, page lock is shifted.
- */
- if (hpage != p) {
- if (!(flags & MF_COUNT_INCREASED)) {
- put_page(hpage);
- get_page(p);
- }
- lock_page(p);
- unlock_page(hpage);
- *hpagep = p;
- }
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
}
@@ -996,13 +950,19 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(ppage, &tokill);
+
+ if (hpage != ppage)
+ lock_page(ppage);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
+ if (hpage != ppage)
+ unlock_page(ppage);
+
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1023,7 +983,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
static void set_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
SetPageHWPoison(hpage + i);
}
@@ -1031,7 +991,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
static void clear_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
ClearPageHWPoison(hpage + i);
}
@@ -1061,7 +1021,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
struct page *hpage;
int res;
unsigned int nr_pages;
- unsigned long page_flags;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1080,18 +1039,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- /*
- * Currently errors on hugetlbfs pages are measured in hugepage units,
- * so nr_pages should be 1 << compound_order. OTOH when errors are on
- * transparent hugepages, they are supposed to be split and error
- * measurement is done in normal page units. So nr_pages should be one
- * in this case.
- */
- if (PageHuge(p))
- nr_pages = 1 << compound_order(hpage);
- else /* normal page or thp */
- nr_pages = 1;
- atomic_long_add(nr_pages, &num_poisoned_pages);
+ nr_pages = 1 << compound_trans_order(hpage);
+ atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -1114,16 +1063,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "filter hit" and "race with other subpage."
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
*/
lock_page(hpage);
- if (PageHWPoison(hpage)) {
- if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- unlock_page(hpage);
- return 0;
- }
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1153,47 +1101,39 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* shake_page could have turned it free.
*/
if (is_free_buddy_page(p)) {
- if (flags & MF_COUNT_INCREASED)
- action_result(pfn, "free buddy", DELAYED);
- else
- action_result(pfn, "free buddy, 2nd try", DELAYED);
+ action_result(pfn, "free buddy, 2nd try",
+ DELAYED);
return 0;
}
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
}
}
- lock_page(hpage);
-
/*
- * We use page flags to determine what action should be taken, but
- * the flags can be modified by the error containment action. One
- * example is an mlocked page, where PG_mlocked is cleared by
- * page_remove_rmap() in try_to_unmap_one(). So to determine page status
- * correctly, we save a copy of the page flags at this time.
+ * Lock the page and wait for writeback to finish.
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
*/
- page_flags = p->flags;
+ lock_page(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(hpage);
res = 0;
goto out;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
unlock_page(hpage);
put_page(hpage);
return 0;
}
- if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
- goto identify_page_state;
-
/*
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
@@ -1214,21 +1154,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p))
set_page_hwpoison_huge_page(hpage);
- /*
- * It's very difficult to mess with pages currently under IO
- * and in many cases impossible, so we just avoid it here.
- */
wait_on_page_writeback(p);
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
- != SWAP_SUCCESS) {
+ if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
res = -EBUSY;
goto out;
@@ -1243,24 +1175,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
goto out;
}
-identify_page_state:
res = -EBUSY;
- /*
- * The first check uses the current page flags which may not have any
- * relevant information. The second check with the saved page flagss is
- * carried out only if the first check can't determine the page status.
- */
- for (ps = error_states;; ps++)
- if ((p->flags & ps->mask) == ps->res)
+ for (ps = error_states;; ps++) {
+ if ((p->flags & ps->mask) == ps->res) {
+ res = page_action(ps, p, pfn);
break;
-
- page_flags |= (p->flags & (1UL << PG_dirty));
-
- if (!ps->mask)
- for (ps = error_states;; ps++)
- if ((page_flags & ps->mask) == ps->res)
- break;
- res = page_action(ps, p, pfn);
+ }
+ }
out:
unlock_page(hpage);
return res;
@@ -1314,10 +1235,10 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
mf_cpu = &get_cpu_var(memory_failure_cpu);
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
- if (kfifo_put(&mf_cpu->fifo, entry))
+ if (kfifo_put(&mf_cpu->fifo, &entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
- pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
pfn);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
@@ -1331,17 +1252,14 @@ static void memory_failure_work_func(struct work_struct *work)
unsigned long proc_flags;
int gotten;
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ mf_cpu = &__get_cpu_var(memory_failure_cpu);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
if (!gotten)
break;
- if (entry.flags & MF_SOFT_OFFLINE)
- soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
- else
- memory_failure(entry.pfn, entry.trapno, entry.flags);
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
}
}
@@ -1391,17 +1309,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- /*
- * unpoison_memory() can encounter thp only when the thp is being
- * worked by memory_failure() and the page lock is not held yet.
- * In such case, we yield to memory_failure() and make unpoison fail.
- */
- if (!PageHuge(page) && PageTransHuge(page)) {
- pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
- return 0;
- }
-
- nr_pages = 1 << compound_order(page);
+ nr_pages = 1 << compound_trans_order(page);
if (!get_page_unless_zero(page)) {
/*
@@ -1415,7 +1323,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
if (TestClearPageHWPoison(p))
- atomic_long_dec(&num_poisoned_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1429,7 +1337,7 @@ int unpoison_memory(unsigned long pfn)
*/
if (TestClearPageHWPoison(page)) {
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
@@ -1437,7 +1345,7 @@ int unpoison_memory(unsigned long pfn)
unlock_page(page);
put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
+ if (freeit)
put_page(page);
return 0;
@@ -1460,7 +1368,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
* that is not free, and 1 for any other page type.
* For 1 the page is returned with increased page count, otherwise not.
*/
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
+static int get_any_page(struct page *p, unsigned long pfn, int flags)
{
int ret;
@@ -1468,15 +1376,28 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
+ * The lock_memory_hotplug prevents a race with memory hotplug.
+ * This is a big hammer, a better would be nicer.
+ */
+ lock_memory_hotplug();
+
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free.
+ */
+ set_migratetype_isolate(p, true);
+ /*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
if (!get_page_unless_zero(compound_head(p))) {
if (PageHuge(p)) {
pr_info("%s: %#lx free huge page\n", __func__, pfn);
- ret = 0;
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
} else if (is_free_buddy_page(p)) {
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
+ /* Set hwpoison bit while page is still isolated */
+ SetPageHWPoison(p);
ret = 0;
} else {
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1487,30 +1408,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
- return ret;
-}
-
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
-{
- int ret = __get_any_page(page, pfn, flags);
-
- if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
- /*
- * Try to free it.
- */
- put_page(page);
- shake_page(page, 1);
-
- /*
- * Did it turn free?
- */
- ret = __get_any_page(page, pfn, 0);
- if (!PageLRU(page)) {
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
- return -EIO;
- }
- }
+ unset_migratetype_isolate(p, MIGRATE_MOVABLE);
+ unlock_memory_hotplug();
return ret;
}
@@ -1519,70 +1418,120 @@ static int soft_offline_huge_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
if (PageHWPoison(hpage)) {
- unlock_page(hpage);
put_page(hpage);
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
- unlock_page(hpage);
/* Keep page count to indicate a given hugepage is isolated. */
- list_move(&hpage->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
+ MIGRATE_SYNC);
+ put_page(hpage);
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
- /*
- * We know that soft_offline_huge_page() tries to migrate
- * only one hugepage pointed to by hpage, so we need not
- * run through the pagelist here.
- */
- putback_active_hugepage(hpage);
- if (ret > 0)
- ret = -EIO;
- } else {
- /* overcommit hugetlb page will be freed to buddy */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
- } else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
- }
+ return ret;
}
+done:
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_trans_order(hpage),
+ &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ /* keep elevated page count for bad page */
return ret;
}
-static int __soft_offline_page(struct page *page, int flags)
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_trans_head(page);
+
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+ if (PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
/*
- * Check PageHWPoison again inside page lock because PageHWPoison
- * is set by memory_failure() outside page lock. Note that
- * memory_failure() also double-checks PageHWPoison inside page lock,
- * so there's no race between soft_offline_page() and memory_failure().
+ * Page cache page we can handle?
*/
+ if (!PageLRU(page)) {
+ /*
+ * Try to free it.
+ */
+ put_page(page);
+ shake_page(page, 1);
+
+ /*
+ * Did it turn free?
+ */
+ ret = get_any_page(page, pfn, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+ }
+ if (!PageLRU(page)) {
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pfn, page->flags);
+ return -EIO;
+ }
+
lock_page(page);
wait_on_page_writeback(page);
+
+ /*
+ * Synchronized using the page lock with memory_failure()
+ */
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
+
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
@@ -1595,10 +1544,9 @@ static int __soft_offline_page(struct page *page, int flags)
*/
if (ret == 1) {
put_page(page);
+ ret = 0;
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
- return 0;
+ goto done;
}
/*
@@ -1615,116 +1563,28 @@ static int __soft_offline_page(struct page *page, int flags)
if (!ret) {
LIST_HEAD(pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ false, MIGRATE_SYNC,
+ MR_MEMORY_FAILURE);
if (ret) {
- if (!list_empty(&pagelist)) {
- list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
-
+ putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- } else {
- /*
- * After page migration succeeds, the source page can
- * be trapped in pagevec and actual freeing is delayed.
- * Freeing code works differently based on PG_hwpoison,
- * so there's a race. We need to make sure that the
- * source page should be freed back to buddy before
- * setting PG_hwpoison.
- */
- if (!is_free_buddy_page(page))
- lru_add_drain_all();
- if (!is_free_buddy_page(page))
- drain_all_pages();
- SetPageHWPoison(page);
- if (!is_free_buddy_page(page))
- pr_info("soft offline: %#lx: page leaked\n",
- pfn);
- atomic_long_inc(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
- return ret;
-}
+ if (ret)
+ return ret;
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
-{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
-
- if (PageHWPoison(page)) {
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
- }
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
- pr_info("soft offline: %#lx: failed to split THP\n",
- pfn);
- return -EBUSY;
- }
- }
-
- get_online_mems();
-
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- set_migratetype_isolate(page, true);
-
- ret = get_any_page(page, pfn, flags);
- put_online_mems();
- if (ret > 0) { /* for in-use pages */
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- } else if (ret == 0) { /* for free pages */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
- } else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
- }
- }
- unset_migratetype_isolate(page, MIGRATE_MOVABLE);
+done:
+ atomic_long_add(1, &mce_bad_pages);
+ SetPageHWPoison(page);
+ /* keep elevated page count for bad page */
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index d67fd9fcf1f..bb1369f7b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,8 +59,6 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
-#include <linux/dma-debug.h>
-#include <linux/debugfs.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -71,10 +69,6 @@
#include "internal.h"
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
-#endif
-
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
@@ -84,6 +78,7 @@ EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
+unsigned long num_physpages;
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -93,6 +88,7 @@ EXPORT_SYMBOL(mem_map);
*/
void * high_memory;
+EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
/*
@@ -211,16 +207,15 @@ static int tlb_next_batch(struct mmu_gather *tlb)
* tear-down from @mm. The @fullmm argument is used when @mm is without
* users and we're going to destroy the full address space (exit/execve).
*/
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
{
tlb->mm = mm;
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
- tlb->need_flush_all = 0;
- tlb->start = start;
- tlb->end = end;
+ tlb->fullmm = fullmm;
+ tlb->start = -1UL;
+ tlb->end = 0;
tlb->need_flush = 0;
+ tlb->fast_mode = (num_possible_cpus() == 1);
tlb->local.next = NULL;
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
@@ -232,18 +227,20 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
#endif
}
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+void tlb_flush_mmu(struct mmu_gather *tlb)
{
+ struct mmu_gather_batch *batch;
+
+ if (!tlb->need_flush)
+ return;
tlb->need_flush = 0;
tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
-}
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
+ if (tlb_fast_mode(tlb))
+ return;
for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
@@ -252,14 +249,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
tlb->active = &tlb->local;
}
-void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- if (!tlb->need_flush)
- return;
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
-}
-
/* tlb_finish_mmu
* Called at the end of the shootdown operation to free up any resources
* that were required.
@@ -268,6 +257,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
+ tlb->start = start;
+ tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -292,6 +283,11 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
VM_BUG_ON(!tlb->need_flush);
+ if (tlb_fast_mode(tlb)) {
+ free_page_and_swap_cache(page);
+ return 1; /* avoid calling tlb_flush_mmu() */
+ }
+
batch = tlb->active;
batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
@@ -299,7 +295,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return 0;
batch = tlb->active;
}
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+ VM_BUG_ON(batch->nr > batch->max);
return batch->max - batch->nr;
}
@@ -384,6 +380,30 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none. Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+void pgd_clear_bad(pgd_t *pgd)
+{
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+}
+
+void pud_clear_bad(pud_t *pud)
+{
+ pud_ERROR(*pud);
+ pud_clear(pud);
+}
+
+void pmd_clear_bad(pmd_t *pmd)
+{
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+}
+
+/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
@@ -393,7 +413,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
- atomic_long_dec(&tlb->mm->nr_ptes);
+ tlb->mm->nr_ptes--;
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -464,6 +484,8 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -561,7 +583,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long address)
{
- spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
int wait_split_huge_page;
if (!new)
@@ -582,15 +603,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
- atomic_long_inc(&mm->nr_ptes);
+ mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
} else if (unlikely(pmd_trans_splitting(*pmd)))
wait_split_huge_page = 1;
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
if (wait_split_huge_page)
@@ -681,7 +702,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
- dump_page(page, "bad pte");
+ dump_page(page);
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -689,13 +710,18 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
if (vma->vm_ops)
- printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
- vma->vm_ops->fault);
- if (vma->vm_file)
- printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
- vma->vm_file->f_op->mmap);
+ print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+ (unsigned long)vma->vm_ops->fault);
+ if (vma->vm_file && vma->vm_file->f_op)
+ print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+ (unsigned long)vma->vm_file->f_op->mmap);
dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ add_taint(TAINT_BAD_PAGE);
+}
+
+static inline bool is_cow_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
@@ -751,7 +777,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte) || pte_numa(pte)))
+ if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -777,15 +803,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
- if (is_zero_pfn(pfn))
- return NULL;
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
@@ -843,8 +868,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
- pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -1123,31 +1146,25 @@ again:
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
- addr) != page->index) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(ptent))
- pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, addr, pte, ptfile);
- }
+ addr) != page->index)
+ set_pte_at(mm, addr, pte,
+ pgoff_to_pte(page->index));
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
- if (pte_dirty(ptent)) {
- force_flush = 1;
+ if (pte_dirty(ptent))
set_page_dirty(page);
- }
if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ likely(!VM_SequentialReadHint(vma)))
mark_page_accessed(page);
rss[MM_FILEPAGES]--;
}
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- if (unlikely(!__tlb_remove_page(tlb, page))) {
- force_flush = 1;
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
break;
- }
continue;
}
/*
@@ -1182,34 +1199,21 @@ again:
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
-
- /* Do the actual TLB flush before dropping ptl */
- if (force_flush) {
- unsigned long old_end;
-
- /*
- * Flush the TLB just for the previous segment,
- * then update the range to be the remaining
- * TLB range.
- */
- old_end = tlb->end;
- tlb->end = addr;
- tlb_flush_mmu_tlbonly(tlb);
- tlb->start = addr;
- tlb->end = old_end;
- }
pte_unmap_unlock(start_pte, ptl);
/*
- * If we forced a TLB flush (either due to running out of
- * batch buffers or because we needed to flush dirty TLB
- * entries before releasing the ptl), free the batched
- * memory too. Restart if we didn't do everything.
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
*/
if (force_flush) {
force_flush = 0;
- tlb_flush_mmu_free(tlb);
+#ifdef HAVE_GENERIC_MMU_GATHER
+ tlb->start = addr;
+ tlb->end = end;
+#endif
+ tlb_flush_mmu(tlb);
if (addr != end)
goto again;
}
@@ -1332,9 +1336,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
- * cleanup path of mmap_region. When
+ * cleanup path of do_mmap_pgoff. When
* hugetlbfs ->mmap method fails,
- * mmap_region() nullifies vma->vm_file
+ * do_mmap_pgoff() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
@@ -1396,7 +1400,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
@@ -1422,7 +1426,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, address, end);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, address, end);
unmap_single_vma(&tlb, vma, address, end, details);
@@ -1453,6 +1457,573 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
+/**
+ * follow_page - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ goto out;
+ }
+
+ page = NULL;
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto no_page_table;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ goto no_page_table;
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+ goto out;
+ }
+ if (unlikely(pud_bad(*pud)))
+ goto no_page_table;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto no_page_table;
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ BUG_ON(flags & FOLL_GET);
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+ goto out;
+ }
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto no_page_table;
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(vma, address, pmd);
+ goto split_fallthrough;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(vma, address,
+ pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ /* fall through */
+ }
+split_fallthrough:
+ if (unlikely(pmd_bad(*pmd)))
+ goto no_page_table;
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+ pte = *ptep;
+ if (!pte_present(pte))
+ goto no_page;
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+
+ page = vm_normal_page(vma, address, pte);
+ if (unlikely(!page)) {
+ if ((flags & FOLL_DUMP) ||
+ !is_zero_pfn(pte_pfn(pte)))
+ goto bad_page;
+ page = pte_page(pte);
+ }
+
+ if (flags & FOLL_GET)
+ get_page_foll(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ /*
+ * pte_mkyoung() would be more correct here, but atomic care
+ * is needed to avoid losing the dirty bit: it is easier to use
+ * mark_page_accessed().
+ */
+ mark_page_accessed(page);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return page;
+
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return page;
+
+no_page_table:
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate unnecessary pages or
+ * page tables. Return error instead of NULL to skip handle_mm_fault,
+ * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely
+ * be zero-filled if handle_mm_fault() actually did handle it.
+ */
+ if ((flags & FOLL_DUMP) &&
+ (!vma->vm_ops || !vma->vm_ops->fault))
+ return ERR_PTR(-EFAULT);
+ return page;
+}
+
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return stack_guard_page_start(vma, addr) ||
+ stack_guard_page_end(vma, addr+PAGE_SIZE);
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, unsigned int gup_flags,
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking)
+{
+ int i;
+ unsigned long vm_flags;
+
+ if (nr_pages <= 0)
+ return 0;
+
+ VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
+ /*
+ * Require read or write permissions.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
+ */
+ vm_flags = (gup_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (gup_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+ * would be called on PROT_NONE ranges. We must never invoke
+ * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+ * page faults would unprotect the PROT_NONE ranges if
+ * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+ * bitflag. So to avoid that, don't set FOLL_NUMA if
+ * FOLL_FORCE is set.
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
+ i = 0;
+
+ do {
+ struct vm_area_struct *vma;
+
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ unsigned long pg = start & PAGE_MASK;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* user gate pages are read-only */
+ if (gup_flags & FOLL_WRITE)
+ return i ? : -EFAULT;
+ if (pg > TASK_SIZE)
+ pgd = pgd_offset_k(pg);
+ else
+ pgd = pgd_offset_gate(mm, pg);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, pg);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, pg);
+ if (pmd_none(*pmd))
+ return i ? : -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map(pmd, pg);
+ if (pte_none(*pte)) {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ vma = get_gate_vma(mm);
+ if (pages) {
+ struct page *page;
+
+ page = vm_normal_page(vma, start, *pte);
+ if (!page) {
+ if (!(gup_flags & FOLL_DUMP) &&
+ is_zero_pfn(pte_pfn(*pte)))
+ page = pte_page(*pte);
+ else {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ }
+ pages[i] = page;
+ get_page(page);
+ }
+ pte_unmap(pte);
+ goto next_page;
+ }
+
+ if (!vma ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+ return i ? : -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i, gup_flags);
+ continue;
+ }
+
+ do {
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+
+ /*
+ * If we have a pending SIGKILL, don't keep faulting
+ * pages and potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
+
+ cond_resched();
+ while (!(page = follow_page(vma, start, foll_flags))) {
+ int ret;
+ unsigned int fault_flags = 0;
+
+ /* For mlock, just skip the stack guard page. */
+ if (foll_flags & FOLL_MLOCK) {
+ if (stack_guard_page(vma, start))
+ goto next_page;
+ }
+ if (foll_flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (foll_flags & FOLL_NOWAIT)
+ fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
+
+ ret = handle_mm_fault(mm, vma, start,
+ fault_flags);
+
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return i ? i : -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON |
+ VM_FAULT_HWPOISON_LARGE)) {
+ if (i)
+ return i;
+ else if (gup_flags & FOLL_HWPOISON)
+ return -EHWPOISON;
+ else
+ return -EFAULT;
+ }
+ if (ret & VM_FAULT_SIGBUS)
+ return i ? i : -EFAULT;
+ BUG();
+ }
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ return i;
+ }
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that
+ * do_wp_page has broken COW when necessary,
+ * even if maybe_mkwrite decided not to set
+ * pte_write. We can thus safely do subsequent
+ * page lookups as if they were reads. But only
+ * do so when looping for pte_write is futile:
+ * in some cases userspace may also be wanting
+ * to write to the gotten user page, which a
+ * read fault here might prevent (a readonly
+ * page might get reCOWed by userspace write).
+ */
+ if ((ret & VM_FAULT_WRITE) &&
+ !(vma->vm_flags & VM_WRITE))
+ foll_flags &= ~FOLL_WRITE;
+
+ cond_resched();
+ }
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+ if (pages) {
+ pages[i] = page;
+
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ }
+next_page:
+ if (vmas)
+ vmas[i] = vma;
+ i++;
+ start += PAGE_SIZE;
+ nr_pages--;
+ } while (nr_pages && start < vma->vm_end);
+ } while (nr_pages);
+ return i;
+}
+EXPORT_SYMBOL(__get_user_pages);
+
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/*
+ * get_user_pages() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to by the caller
+ * @force: whether to force write access even if user mapping is
+ * readonly. This will result in the page being COWed even
+ * in MAP_SHARED mappings. You do not want this.
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, int write, int force,
+ struct page **pages, struct vm_area_struct **vmas)
+{
+ int flags = FOLL_TOUCH;
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ if (__get_user_pages(current, current->mm, addr, 1,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
+ return NULL;
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ return page;
+}
+#endif /* CONFIG_ELF_CORE */
+
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
@@ -1786,53 +2357,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(remap_pfn_range);
-/**
- * vm_iomap_memory - remap memory to userspace
- * @vma: user vma to map to
- * @start: start of area
- * @len: size of area
- *
- * This is a simplified io_remap_pfn_range() for common driver use. The
- * driver just needs to give us the physical memory range to be mapped,
- * we'll figure out the rest from the vma information.
- *
- * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
- * whatever write-combining details or similar.
- */
-int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
-{
- unsigned long vm_len, pfn, pages;
-
- /* Check that the physical memory area passed in looks valid */
- if (start + len < start)
- return -EINVAL;
- /*
- * You *really* shouldn't map things that aren't page-aligned,
- * but we've historically allowed it because IO memory might
- * just have smaller alignment.
- */
- len += start & ~PAGE_MASK;
- pfn = start >> PAGE_SHIFT;
- pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
- if (pfn + pages < pfn)
- return -EINVAL;
-
- /* We start the mapping 'vm_pgoff' pages into the area */
- if (vma->vm_pgoff > pages)
- return -EINVAL;
- pfn += vma->vm_pgoff;
- pages -= vma->vm_pgoff;
-
- /* Can we fit all of the mapping? */
- vm_len = vma->vm_end - vma->vm_start;
- if (vm_len >> PAGE_SHIFT > pages)
- return -EINVAL;
-
- /* Ok, let it rip */
- return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
-}
-EXPORT_SYMBOL(vm_iomap_memory);
-
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
@@ -1961,8 +2485,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
- debug_dma_assert_idle(src);
-
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
@@ -1988,38 +2510,6 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
}
/*
- * Notify the address space that the page is about to become writable so that
- * it can prohibit this or wait for the page to get into an appropriate state.
- *
- * We do this without the lock held, so that it can sleep if it needs to.
- */
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- struct vm_fault vmf;
- int ret;
-
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.page = page;
-
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
- return ret;
- if (unlikely(!(ret & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
- return 0; /* retry */
- }
- ret |= VM_FAULT_LOCKED;
- } else
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- return ret;
-}
-
-/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
@@ -2101,15 +2591,42 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ struct vm_fault vmf;
int tmp;
+
+ vmf.virtual_address = (void __user *)(address &
+ PAGE_MASK);
+ vmf.pgoff = old_page->index;
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ vmf.page = old_page;
+
+ /*
+ * Notify the address space that the page is about to
+ * become writable so that it can prohibit this or wait
+ * for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can
+ * sleep if it needs to.
+ */
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
+
+ tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+ if (unlikely(tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ ret = tmp;
+ goto unwritable_page;
}
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(old_page);
+ if (!old_page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(old_page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(old_page));
+
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
@@ -2129,14 +2646,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2154,11 +2663,11 @@ reuse:
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
- * do_shared_fault is protected similarly.
+ * __do_fault is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
@@ -2204,7 +2713,7 @@ gotten:
}
__SetPageUptodate(new_page);
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
@@ -2298,6 +2807,10 @@ oom:
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;
+
+unwritable_page:
+ page_cache_release(old_page);
+ return ret;
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2317,7 +2830,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
- vea = vba + vma_pages(vma) - 1;
+ vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
@@ -2406,7 +2919,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page, *swapcache;
+ struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -2457,11 +2970,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- swapcache = page;
goto out_release;
}
- swapcache = page;
locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2479,11 +2990,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (ksm_might_need_to_copy(page, vma, address)) {
+ swapcache = page;
+ page = ksm_does_need_to_copy(page, vma, address);
+
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ swapcache = NULL;
+ goto out_page;
+ }
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2527,13 +3043,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
exclusive = 1;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
- pte = pte_mksoft_dirty(pte);
set_pte_at(mm, address, page_table, pte);
- if (page == swapcache)
- do_page_add_anon_rmap(page, vma, address, exclusive);
- else /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
+ do_page_add_anon_rmap(page, vma, address, exclusive);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -2541,7 +3052,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
- if (page != swapcache) {
+ if (swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -2574,7 +3085,7 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
- if (page != swapcache) {
+ if (swapcache) {
unlock_page(swapcache);
page_cache_release(swapcache);
}
@@ -2650,14 +3161,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
- /*
- * The memory barrier inside __SetPageUptodate makes sure that
- * preceeding stores to the page contents become visible before
- * the set_pte_at() write.
- */
__SetPageUptodate(page);
- if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -2688,11 +3194,53 @@ oom:
return VM_FAULT_OOM;
}
-static int __do_fault(struct vm_area_struct *vma, unsigned long address,
- pgoff_t pgoff, unsigned int flags, struct page **page)
+/*
+ * __do_fault() tries to create a new page mapping. It aggressively
+ * tries to share with existing pages, but makes a separate copy if
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte neither mapped nor locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
+ pte_t *page_table;
+ spinlock_t *ptl;
+ struct page *page;
+ struct page *cow_page;
+ pte_t entry;
+ int anon = 0;
+ struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
+ int page_mkwrite = 0;
+
+ /*
+ * If we do COW later, allocate page befor taking lock_page()
+ * on the file cache page. This will reduce lock holding time.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!cow_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+ page_cache_release(cow_page);
+ return VM_FAULT_OOM;
+ }
+ } else
+ cow_page = NULL;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
@@ -2700,315 +3248,148 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY)))
+ goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- page_cache_release(vmf.page);
- return VM_FAULT_HWPOISON;
+ ret = VM_FAULT_HWPOISON;
+ goto uncharge_out;
}
+ /*
+ * For consistency in subsequent calls, make the faulted page always
+ * locked.
+ */
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
-
- *page = vmf.page;
- return ret;
-}
-
-/**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
- *
- * @vma: virtual memory area
- * @address: user virtual address
- * @page: page to map
- * @pte: pointer to target page table entry
- * @write: true, if new entry is writable
- * @anon: true, if it's anonymous page
- *
- * Caller must hold page table lock relevant for @pte.
- *
- * Target users are page handler itself and implementations of
- * vm_ops->map_pages.
- */
-void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- struct page *page, pte_t *pte, bool write, bool anon)
-{
- pte_t entry;
-
- flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (write)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
- pte_mksoft_dirty(entry);
- if (anon) {
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- } else {
- inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
- page_add_file_rmap(page);
- }
- set_pte_at(vma->vm_mm, address, pte, entry);
-
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, pte);
-}
-
-static unsigned long fault_around_bytes = 65536;
-
-/*
- * fault_around_pages() and fault_around_mask() round down fault_around_bytes
- * to nearest page order. It's what do_fault_around() expects to see.
- */
-static inline unsigned long fault_around_pages(void)
-{
- return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE;
-}
-
-static inline unsigned long fault_around_mask(void)
-{
- return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK;
-}
-
-
-#ifdef CONFIG_DEBUG_FS
-static int fault_around_bytes_get(void *data, u64 *val)
-{
- *val = fault_around_bytes;
- return 0;
-}
-
-static int fault_around_bytes_set(void *data, u64 val)
-{
- if (val / PAGE_SIZE > PTRS_PER_PTE)
- return -EINVAL;
- fault_around_bytes = val;
- return 0;
-}
-DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
- fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
-
-static int __init fault_around_debugfs(void)
-{
- void *ret;
-
- ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
- &fault_around_bytes_fops);
- if (!ret)
- pr_warn("Failed to create fault_around_bytes in debugfs");
- return 0;
-}
-late_initcall(fault_around_debugfs);
-#endif
-
-/*
- * do_fault_around() tries to map few pages around the fault address. The hope
- * is that the pages will be needed soon and this will lower the number of
- * faults to handle.
- *
- * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
- * not ready to be mapped: not up-to-date, locked, etc.
- *
- * This function is called with the page table lock taken. In the split ptlock
- * case the page table lock only protects only those entries which belong to
- * the page table corresponding to the fault address.
- *
- * This function doesn't cross the VMA boundaries, in order to call map_pages()
- * only once.
- *
- * fault_around_pages() defines how many pages we'll try to map.
- * do_fault_around() expects it to return a power of two less than or equal to
- * PTRS_PER_PTE.
- *
- * The virtual address of the area that we map is naturally aligned to the
- * fault_around_pages() value (and therefore to page order). This way it's
- * easier to guarantee that we don't cross page table boundaries.
- */
-static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pgoff_t pgoff, unsigned int flags)
-{
- unsigned long start_addr;
- pgoff_t max_pgoff;
- struct vm_fault vmf;
- int off;
-
- start_addr = max(address & fault_around_mask(), vma->vm_start);
- off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
- pte -= off;
- pgoff -= off;
+ VM_BUG_ON(!PageLocked(vmf.page));
/*
- * max_pgoff is either end of page table or end of vma
- * or fault_around_pages() from pgoff, depending what is nearest.
+ * Should we do an early C-O-W break?
*/
- max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
- PTRS_PER_PTE - 1;
- max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
- pgoff + fault_around_pages() - 1);
-
- /* Check if it makes any sense to call ->map_pages */
- while (!pte_none(*pte)) {
- if (++pgoff > max_pgoff)
- return;
- start_addr += PAGE_SIZE;
- if (start_addr >= vma->vm_end)
- return;
- pte++;
- }
+ page = vmf.page;
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!(vma->vm_flags & VM_SHARED)) {
+ page = cow_page;
+ anon = 1;
+ copy_user_highpage(page, vmf.page, address, vma);
+ __SetPageUptodate(page);
+ } else {
+ /*
+ * If the page will be shareable, see if the backing
+ * address space wants to know that the page is about
+ * to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ unlock_page(page);
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+ if (unlikely(tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ ret = tmp;
+ goto unwritable_page;
+ }
+ if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ ret = 0; /* retry the fault */
+ unlock_page(page);
+ goto unwritable_page;
+ }
+ } else
+ VM_BUG_ON(!PageLocked(page));
+ page_mkwrite = 1;
+ }
+ }
- vmf.virtual_address = (void __user *) start_addr;
- vmf.pte = pte;
- vmf.pgoff = pgoff;
- vmf.max_pgoff = max_pgoff;
- vmf.flags = flags;
- vma->vm_ops->map_pages(vma, &vmf);
-}
+ }
-static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
-{
- struct page *fault_page;
- spinlock_t *ptl;
- pte_t *pte;
- int ret = 0;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
- * Let's call ->map_pages() first and use ->fault() as fallback
- * if page by the offset is not ready to be mapped (cold cache or
- * something).
+ * This silly early PAGE_DIRTY setting removes a race
+ * due to the bad i386 page protection. But it's valid
+ * for other architectures too.
+ *
+ * Note that if FAULT_FLAG_WRITE is set, we either now have
+ * an exclusive copy of the page, or this is a shared mapping,
+ * so we can make it writable and dirty to avoid having to
+ * handle that later.
*/
- if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- do_fault_around(vma, address, pte, pgoff, flags);
- if (!pte_same(*pte, orig_pte))
- goto unlock_out;
- pte_unmap_unlock(pte, ptl);
- }
-
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
+ /* Only go through if we didn't race with anybody else... */
+ if (likely(pte_same(*page_table, orig_pte))) {
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (flags & FAULT_FLAG_WRITE)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (anon) {
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
+ page_add_file_rmap(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page);
+ }
+ }
+ set_pte_at(mm, address, page_table, entry);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
- return ret;
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, page_table);
+ } else {
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
}
- do_set_pte(vma, address, fault_page, pte, false, false);
- unlock_page(fault_page);
-unlock_out:
- pte_unmap_unlock(pte, ptl);
- return ret;
-}
-static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
-{
- struct page *fault_page, *new_page;
- spinlock_t *ptl;
- pte_t *pte;
- int ret;
+ pte_unmap_unlock(page_table, ptl);
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
+ if (dirty_page) {
+ struct address_space *mapping = page->mapping;
+ int dirtied = 0;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (!new_page)
- return VM_FAULT_OOM;
+ if (set_page_dirty(dirty_page))
+ dirtied = 1;
+ unlock_page(dirty_page);
+ put_page(dirty_page);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping but still
+ * dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
- page_cache_release(new_page);
- return VM_FAULT_OOM;
+ /* file_update_time outside page_lock */
+ if (vma->vm_file && !page_mkwrite)
+ file_update_time(vma->vm_file);
+ } else {
+ unlock_page(vmf.page);
+ if (anon)
+ page_cache_release(vmf.page);
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- goto uncharge_out;
-
- copy_user_highpage(new_page, fault_page, address, vma);
- __SetPageUptodate(new_page);
+ return ret;
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
- goto uncharge_out;
- }
- do_set_pte(vma, address, new_page, pte, true, true);
- pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+unwritable_page:
+ page_cache_release(page);
return ret;
uncharge_out:
- mem_cgroup_uncharge_page(new_page);
- page_cache_release(new_page);
- return ret;
-}
-
-static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
-{
- struct page *fault_page;
- struct address_space *mapping;
- spinlock_t *ptl;
- pte_t *pte;
- int dirtied = 0;
- int ret, tmp;
-
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
-
- /*
- * Check if the backing address space wants to know that the page is
- * about to become writable
- */
- if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, address);
- if (unlikely(!tmp ||
- (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(fault_page);
- return tmp;
- }
- }
-
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (unlikely(!pte_same(*pte, orig_pte))) {
- pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
- return ret;
+ /* fs's fault handler get error */
+ if (cow_page) {
+ mem_cgroup_uncharge_page(cow_page);
+ page_cache_release(cow_page);
}
- do_set_pte(vma, address, fault_page, pte, true, false);
- pte_unmap_unlock(pte, ptl);
-
- if (set_page_dirty(fault_page))
- dirtied = 1;
- mapping = fault_page->mapping;
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- /* file_update_time outside page_lock */
- if (vma->vm_file && !vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
return ret;
}
@@ -3020,13 +3401,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- if (!(flags & FAULT_FLAG_WRITE))
- return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3058,40 +3433,29 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
pgoff = pte_to_pgoff(orig_pte);
- if (!(flags & FAULT_FLAG_WRITE))
- return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid,
- int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int current_nid)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == numa_node_id()) {
+ if (current_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- *flags |= TNF_FAULT_LOCAL;
- }
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
struct page *page = NULL;
spinlock_t *ptl;
- int page_nid = -1;
- int last_cpupid;
+ int current_nid = -1;
int target_nid;
bool migrated = false;
- int flags = 0;
/*
* The "pte" at this point cannot be used safely without
@@ -3118,45 +3482,124 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
- BUG_ON(is_zero_pfn(page_to_pfn(page)));
- /*
- * Avoid grouping on DSO/COW pages in specific and RO pages
- * in general, RO pages shouldn't hurt as much anyway since
- * they can be in shared cache state.
- */
- if (!pte_write(pte))
- flags |= TNF_NO_GROUP;
-
- /*
- * Flag if the page is shared between multiple address spaces. This
- * is later used when determining whether to group tasks together
- */
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
- flags |= TNF_SHARED;
-
- last_cpupid = page_cpupid_last(page);
- page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
+ current_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, current_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
+ /*
+ * Account for the fault against the current node if it not
+ * being replaced regardless of where the page is located.
+ */
+ current_nid = numa_node_id();
put_page(page);
goto out;
}
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
- page_nid = target_nid;
- flags |= TNF_MIGRATED;
- }
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
out:
- if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, 1, flags);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, 1, migrated);
return 0;
}
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+ int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int curr_nid = local_nid;
+ int target_nid;
+ bool migrated;
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ /*
+ * Note that the NUMA fault is later accounted to either
+ * the node that is currently running or where the page is
+ * migrated to.
+ */
+ curr_nid = local_nid;
+ target_nid = numa_migrate_prep(page, vma, addr,
+ page_to_nid(page));
+ if (target_nid == -1) {
+ put_page(page);
+ continue;
+ }
+
+ /* Migrate to the requested node */
+ pte_unmap_unlock(pte, ptl);
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ curr_nid = target_nid;
+ task_numa_fault(curr_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3170,7 +3613,7 @@ out:
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int handle_pte_fault(struct mm_struct *mm,
+int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
@@ -3229,17 +3672,26 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*/
-static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ __set_current_state(TASK_RUNNING);
+
+ count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
+
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
+retry:
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
@@ -3248,12 +3700,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ return do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
int ret;
@@ -3277,16 +3726,26 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ /*
+ * If COW results in an oom, the huge pmd will
+ * have been split, so retry the fault on the
+ * pte for a smaller charge.
+ */
+ if (unlikely(ret & VM_FAULT_OOM))
+ goto retry;
+ return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
- return 0;
}
+
+ return 0;
}
}
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
+
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
@@ -3309,43 +3768,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
-{
- int ret;
-
- __set_current_state(TASK_RUNNING);
-
- count_vm_event(PGFAULT);
- mem_cgroup_count_vm_event(mm, PGFAULT);
-
- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
- /*
- * Enable the memcg OOM handling for faults triggered in user
- * space. Kernel faults are handled more gracefully.
- */
- if (flags & FAULT_FLAG_USER)
- mem_cgroup_oom_enable();
-
- ret = __handle_mm_fault(mm, vma, address, flags);
-
- if (flags & FAULT_FLAG_USER) {
- mem_cgroup_oom_disable();
- /*
- * The task may have entered a memcg OOM situation but
- * if the allocation error was handled gracefully (no
- * VM_FAULT_OOM), there is no need to kill anything.
- * Just clean up the OOM state peacefully.
- */
- if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
- mem_cgroup_oom_synchronize(false);
- }
-
- return ret;
-}
-
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
@@ -3399,6 +3821,30 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+ int ret, len, write;
+ struct vm_area_struct * vma;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ return -ENOMEM;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
+ BUG_ON(addr >= end);
+ BUG_ON(end > vma->vm_end);
+ len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
+ ret = get_user_pages(current, current->mm, addr,
+ len, write, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ return ret == len ? 0 : -EFAULT;
+}
+
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
@@ -3564,7 +4010,6 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
return len;
}
-EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*
@@ -3701,7 +4146,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
up_read(&mm->mmap_sem);
}
-#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
+#ifdef CONFIG_PROVE_LOCKING
void might_fault(void)
{
/*
@@ -3713,17 +4158,13 @@ void might_fault(void)
if (segment_eq(get_fs(), KERNEL_DS))
return;
+ might_sleep();
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
- if (in_atomic())
- return;
-
- __might_sleep(__FILE__, __LINE__, 0);
-
- if (current->mm)
+ if (!in_atomic() && current->mm)
might_lock_read(&current->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
@@ -3799,30 +4240,3 @@ void copy_user_huge_page(struct page *dst, struct page *src,
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-
-#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
-
-static struct kmem_cache *page_ptl_cachep;
-
-void __init ptlock_cache_init(void)
-{
- page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
- SLAB_PANIC, NULL);
-}
-
-bool ptlock_alloc(struct page *page)
-{
- spinlock_t *ptl;
-
- ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
- if (!ptl)
- return false;
- page->ptl = ptl;
- return true;
-}
-
-void ptlock_free(struct page *page)
-{
- kmem_cache_free(page_ptl_cachep, page->ptl);
-}
-#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 469bbf505f8..d04ed87bfac 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,6 +9,7 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
+#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
@@ -28,9 +29,6 @@
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
-#include <linux/stop_machine.h>
-#include <linux/hugetlb.h>
-#include <linux/memblock.h>
#include <asm/tlbflush.h>
@@ -46,84 +44,23 @@
static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
-static DEFINE_MUTEX(online_page_callback_lock);
-/* The same as the cpu_hotplug lock, but for memory hotplug. */
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing mem hotplug operation.
- */
- int refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} mem_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
- .refcount = 0,
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "mem_hotplug.lock" },
-#endif
-};
-
-/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
-#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
-#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
-#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+DEFINE_MUTEX(mem_hotplug_mutex);
-void get_online_mems(void)
+void lock_memory_hotplug(void)
{
- might_sleep();
- if (mem_hotplug.active_writer == current)
- return;
- memhp_lock_acquire_read();
- mutex_lock(&mem_hotplug.lock);
- mem_hotplug.refcount++;
- mutex_unlock(&mem_hotplug.lock);
-
-}
-
-void put_online_mems(void)
-{
- if (mem_hotplug.active_writer == current)
- return;
- mutex_lock(&mem_hotplug.lock);
-
- if (WARN_ON(!mem_hotplug.refcount))
- mem_hotplug.refcount++; /* try to fix things up */
-
- if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
- wake_up_process(mem_hotplug.active_writer);
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
+ mutex_lock(&mem_hotplug_mutex);
+ /* for exclusive hibernation if CONFIG_HIBERNATION=y */
+ lock_system_sleep();
}
-static void mem_hotplug_begin(void)
+void unlock_memory_hotplug(void)
{
- mem_hotplug.active_writer = current;
-
- memhp_lock_acquire();
- for (;;) {
- mutex_lock(&mem_hotplug.lock);
- if (likely(!mem_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&mem_hotplug.lock);
- schedule();
- }
+ unlock_system_sleep();
+ mutex_unlock(&mem_hotplug_mutex);
}
-static void mem_hotplug_done(void)
-{
- mem_hotplug.active_writer = NULL;
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
-}
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
@@ -137,7 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->end = start + size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
- pr_debug("System RAM resource %pR cannot be added\n", res);
+ printk("System RAM resource %pR cannot be added\n", res);
kfree(res);
res = NULL;
}
@@ -154,8 +91,9 @@ static void release_memory_resource(struct resource *res)
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type)
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
{
page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
@@ -163,9 +101,12 @@ void get_page_bootmem(unsigned long info, struct page *page,
atomic_inc(&page->_count);
}
-void put_page_bootmem(struct page *page)
+/* reference to __meminit __free_pages_bootmem is valid
+ * so use __ref to tell modpost not to generate a warning */
+void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
+ static DEFINE_MUTEX(ppb_lock);
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -175,12 +116,18 @@ void put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
- free_reserved_page(page);
+
+ /*
+ * Please refer to comment for __free_pages_bootmem()
+ * for why we serialize here.
+ */
+ mutex_lock(&ppb_lock);
+ __free_pages_bootmem(page, 0);
+ mutex_unlock(&ppb_lock);
}
+
}
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
unsigned long *usemap, mapsize, section_nr, i;
@@ -214,32 +161,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
-#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long *usemap, mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
-
- if (!pfn_valid(start_pfn))
- return;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
-
- usemap = __nr_to_section(section_nr)->pageblock_flags;
- page = virt_to_page(usemap);
-
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
@@ -256,7 +177,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
zone = &pgdat->node_zones[0];
for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
- if (zone_is_initialized(zone)) {
+ if (zone->wait_table) {
nr_pages = zone->wait_table_hash_nr_entries
* sizeof(wait_queue_head_t);
nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
@@ -268,21 +189,21 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
}
pfn = pgdat->node_start_pfn;
- end_pfn = pgdat_end_pfn(pgdat);
+ end_pfn = pfn + pgdat->node_spanned_pages;
- /* register section info */
+ /* register_section info */
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
/*
* Some platforms can assign the same pfn to multiple nodes - on
* node0 as well as nodeN. To avoid registering a pfn against
* multiple nodes we check that this pfn does not already
- * reside in some other nodes.
+ * reside in some other node.
*/
if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
register_page_bootmem_info_section(pfn);
}
}
-#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
@@ -291,8 +212,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writelock(zone);
- old_zone_end_pfn = zone_end_pfn(zone);
- if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -332,17 +253,6 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}
-/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
-static int __ref ensure_zone_is_initialized(struct zone *zone,
- unsigned long start_pfn, unsigned long num_pages)
-{
- if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages,
- MEMMAP_HOTPLUG);
- return 0;
-}
-
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
unsigned long start_pfn, unsigned long end_pfn)
{
@@ -350,16 +260,19 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
unsigned long flags;
unsigned long z1_start_pfn;
- ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
+ if (!z1->wait_table) {
+ ret = init_currently_empty_zone(z1, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
pgdat_resize_lock(z1->zone_pgdat, &flags);
/* can't move pfns which are higher than @z2 */
- if (end_pfn > zone_end_pfn(z2))
+ if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
goto out_fail;
- /* the move out part must be at the left most of @z2 */
+ /* the move out part mast at the left most of @z2 */
if (start_pfn > z2->zone_start_pfn)
goto out_fail;
/* must included/overlap */
@@ -367,13 +280,13 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
goto out_fail;
/* use start_pfn for z1's start_pfn if z1 is empty */
- if (!zone_is_empty(z1))
+ if (z1->spanned_pages)
z1_start_pfn = z1->zone_start_pfn;
else
z1_start_pfn = start_pfn;
resize_zone(z1, z1_start_pfn, end_pfn);
- resize_zone(z2, end_pfn, zone_end_pfn(z2));
+ resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
pgdat_resize_unlock(z1->zone_pgdat, &flags);
@@ -392,9 +305,12 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
unsigned long flags;
unsigned long z2_end_pfn;
- ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
+ if (!z2->wait_table) {
+ ret = init_currently_empty_zone(z2, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
pgdat_resize_lock(z1->zone_pgdat, &flags);
@@ -402,15 +318,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
if (z1->zone_start_pfn > start_pfn)
goto out_fail;
/* the move out part mast at the right most of @z1 */
- if (zone_end_pfn(z1) > end_pfn)
+ if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
goto out_fail;
/* must included/overlap */
- if (start_pfn >= zone_end_pfn(z1))
+ if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
goto out_fail;
/* use end_pfn for z2's end_pfn if z2 is empty */
- if (!zone_is_empty(z2))
- z2_end_pfn = zone_end_pfn(z2);
+ if (z2->spanned_pages)
+ z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
else
z2_end_pfn = end_pfn;
@@ -430,7 +346,8 @@ out_fail:
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
@@ -446,13 +363,16 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int nid = pgdat->node_id;
int zone_type;
unsigned long flags;
- int ret;
zone_type = zone - pgdat->node_zones;
- ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
- if (ret)
- return ret;
+ if (!zone->wait_table) {
+ int ret;
+ ret = init_currently_empty_zone(zone, phys_start_pfn,
+ nr_pages, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
pgdat_resize_lock(zone->zone_pgdat, &flags);
grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -466,12 +386,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
static int __meminit __add_section(int nid, struct zone *zone,
unsigned long phys_start_pfn)
{
+ int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(zone, phys_start_pfn);
+ ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
if (ret < 0)
return ret;
@@ -484,6 +405,36 @@ static int __meminit __add_section(int nid, struct zone *zone,
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+ /*
+ * XXX: Freeing memmap with vmemmap is not implement yet.
+ * This should be removed later.
+ */
+ return -EBUSY;
+}
+#else
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+ unsigned long flags;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int ret = -EINVAL;
+
+ if (!valid_section(ms))
+ return ret;
+
+ ret = unregister_memory_section(ms);
+ if (ret)
+ return ret;
+
+ pgdat_resize_lock(pgdat, &flags);
+ sparse_remove_one_section(zone, ms);
+ pgdat_resize_unlock(pgdat, &flags);
+ return 0;
+}
+#endif
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -517,230 +468,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__add_pages);
-#ifdef CONFIG_MEMORY_HOTREMOVE
-/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
-static int find_smallest_section_pfn(int nid, struct zone *zone,
- unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct mem_section *ms;
-
- for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
- continue;
-
- if (unlikely(pfn_to_nid(start_pfn) != nid))
- continue;
-
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
- continue;
-
- return start_pfn;
- }
-
- return 0;
-}
-
-/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
-static int find_biggest_section_pfn(int nid, struct zone *zone,
- unsigned long start_pfn,
- unsigned long end_pfn)
-{
- struct mem_section *ms;
- unsigned long pfn;
-
- /* pfn is the end pfn of a memory section. */
- pfn = end_pfn - 1;
- for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
- continue;
-
- if (unlikely(pfn_to_nid(pfn) != nid))
- continue;
-
- if (zone && zone != page_zone(pfn_to_page(pfn)))
- continue;
-
- return pfn;
- }
-
- return 0;
-}
-
-static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = zone_to_nid(zone);
-
- zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the zone, it need
- * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
- if (pfn) {
- zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
- }
- } else if (zone_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the zone, it need
- * shrink zone->spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
- start_pfn);
- if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
- }
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
- zone_span_writeunlock(zone);
-}
-
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = pgdat->node_id;
-
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
-
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
- continue;
-
- if (pfn_to_nid(pfn) != nid)
- continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- return;
- }
-
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
-}
-
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
- int zone_type;
- unsigned long flags;
-
- zone_type = zone - pgdat->node_zones;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
-}
-
-static int __remove_section(struct zone *zone, struct mem_section *ms)
-{
- unsigned long start_pfn;
- int scn_nr;
- int ret = -EINVAL;
-
- if (!valid_section(ms))
- return ret;
-
- ret = unregister_memory_section(ms);
- if (ret)
- return ret;
-
- scn_nr = __section_nr(ms);
- start_pfn = section_nr_to_pfn(scn_nr);
- __remove_zone(zone, start_pfn);
-
- sparse_remove_one_section(zone, ms);
- return 0;
-}
-
/**
* __remove_pages() - remove sections of pages from a zone
* @zone: zone from which pages need to be removed
@@ -755,10 +482,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long nr_pages)
{
- unsigned long i;
+ unsigned long i, ret = 0;
int sections_to_remove;
- resource_size_t start, size;
- int ret = 0;
/*
* We can only remove entire sections
@@ -766,15 +491,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
+ release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
@@ -786,22 +503,19 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
-#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- get_online_mems();
- mutex_lock(&online_page_callback_lock);
+ lock_memory_hotplug();
if (online_page_callback == generic_online_page) {
online_page_callback = callback;
rc = 0;
}
- mutex_unlock(&online_page_callback_lock);
- put_online_mems();
+ unlock_memory_hotplug();
return rc;
}
@@ -811,16 +525,14 @@ int restore_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- get_online_mems();
- mutex_lock(&online_page_callback_lock);
+ lock_memory_hotplug();
if (online_page_callback == callback) {
online_page_callback = generic_online_page;
rc = 0;
}
- mutex_unlock(&online_page_callback_lock);
- put_online_mems();
+ unlock_memory_hotplug();
return rc;
}
@@ -828,18 +540,29 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
void __online_page_set_limits(struct page *page)
{
+ unsigned long pfn = page_to_pfn(page);
+
+ if (pfn >= num_physpages)
+ num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);
void __online_page_increment_counters(struct page *page)
{
- adjust_managed_page_count(page, 1);
+ totalram_pages++;
+
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages++;
+#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);
void __online_page_free(struct page *page)
{
- __free_reserved_page(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ __free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);
@@ -960,7 +683,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
- unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
@@ -968,7 +690,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- mem_hotplug_begin();
+ lock_memory_hotplug();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -976,18 +698,23 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
- ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
- !can_online_high_movable(zone))
- goto out;
+ !can_online_high_movable(zone)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
- if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- goto out;
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
}
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
- if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- goto out;
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
}
/* Previous code may changed the zone of the pfn range */
@@ -997,13 +724,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = page_to_nid(pfn_to_page(pfn));
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ unlock_memory_hotplug();
+ return ret;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -1027,15 +755,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ unlock_memory_hotplug();
+ return ret;
}
+ zone->managed_pages += onlined_pages;
zone->present_pages += onlined_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages += onlined_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
-
if (onlined_pages) {
node_states_set_node(zone_to_nid(zone), &arg);
if (need_zonelists_rebuild)
@@ -1057,9 +783,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
-out:
- mem_hotplug_done();
- return ret;
+ unlock_memory_hotplug();
+
+ return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1069,16 +795,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
unsigned long zholes_size[MAX_NR_ZONES] = {0};
- unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
- pgdat = NODE_DATA(nid);
- if (!pgdat) {
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- return NULL;
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat)
+ return NULL;
- arch_refresh_nodedata(nid, pgdat);
- }
+ arch_refresh_nodedata(nid, pgdat);
/* we can use NODE_DATA(nid) from here */
@@ -1104,23 +827,17 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
-/**
- * try_online_node - online a node if offlined
- *
+/*
* called by cpu_up() to online a node without onlined memory.
*/
-int try_online_node(int nid)
+int mem_online_node(int nid)
{
pg_data_t *pgdat;
int ret;
- if (node_online(nid))
- return 0;
-
- mem_hotplug_begin();
+ lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
- pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
@@ -1128,65 +845,32 @@ int try_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
-
out:
- mem_hotplug_done();
+ unlock_memory_hotplug();
return ret;
}
-static int check_hotplug_memory_range(u64 start, u64 size)
-{
- u64 start_pfn = PFN_DOWN(start);
- u64 nr_pages = size >> PAGE_SHIFT;
-
- /* Memory range must be aligned with section */
- if ((start_pfn & ~PAGE_SECTION_MASK) ||
- (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
- pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
- (unsigned long long)start,
- (unsigned long long)size);
- return -EINVAL;
- }
-
- return 0;
-}
-
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
pg_data_t *pgdat = NULL;
- bool new_pgdat;
- bool new_node;
+ int new_pgdat = 0;
struct resource *res;
int ret;
- ret = check_hotplug_memory_range(start, size);
- if (ret)
- return ret;
+ lock_memory_hotplug();
res = register_memory_resource(start, size);
ret = -EEXIST;
if (!res)
- return ret;
-
- { /* Stupid hack to suppress address-never-null warning */
- void *p = NODE_DATA(nid);
- new_pgdat = !p;
- }
-
- mem_hotplug_begin();
+ goto out;
- new_node = !node_online(nid);
- if (new_node) {
+ if (!node_online(nid)) {
pgdat = hotadd_new_pgdat(nid, start);
ret = -ENOMEM;
if (!pgdat)
goto error;
+ new_pgdat = 1;
}
/* call arch's memory hotadd */
@@ -1198,7 +882,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
/* we online node here. we can't roll back from here. */
node_set_online(nid);
- if (new_node) {
+ if (new_pgdat) {
ret = register_one_node(nid);
/*
* If sysfs file of new node can't create, cpu on the node
@@ -1217,10 +901,11 @@ error:
/* rollback pgdat allocation and others */
if (new_pgdat)
rollback_node_hotadd(nid, pgdat);
- release_memory_resource(res);
+ if (res)
+ release_memory_resource(res);
out:
- mem_hotplug_done();
+ unlock_memory_hotplug();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -1300,12 +985,10 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
}
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
- * and hugepages). We scan pfn because it's much easier than scanning over
- * linked list. This function returns the pfn of the first found movable
- * page if it's found, otherwise 0.
+ * Scanning pfn is much easier than scanning lru list.
+ * Scan pfn from start to end and Find LRU page.
*/
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
struct page *page;
@@ -1314,13 +997,6 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
page = pfn_to_page(pfn);
if (PageLRU(page))
return pfn;
- if (PageHuge(page)) {
- if (is_hugepage_active(page))
- return pfn;
- else
- pfn = round_up(pfn + 1,
- 1 << compound_order(page)) - 1;
- }
}
}
return 0;
@@ -1341,19 +1017,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
-
- if (PageHuge(page)) {
- struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
- if (compound_order(head) > PFN_SECTION_SHIFT) {
- ret = -EBUSY;
- break;
- }
- if (isolate_huge_page(page, &source))
- move_pages -= 1 << compound_order(head);
- continue;
- }
-
if (!get_page_unless_zero(page))
continue;
/*
@@ -1372,7 +1035,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
- dump_page(page, "failed to remove from LRU");
+ dump_page(page);
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
@@ -1386,7 +1049,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
}
if (!list_empty(&source)) {
if (not_managed) {
- putback_movable_pages(&source);
+ putback_lru_pages(&source);
goto out;
}
@@ -1394,10 +1057,11 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* alloc_migrate_target should be improooooved!!
* migrate_pages returns # of failed pages.
*/
- ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ ret = migrate_pages(&source, alloc_migrate_target, 0,
+ true, MIGRATE_SYNC,
+ MR_MEMORY_HOTPLUG);
if (ret)
- putback_movable_pages(&source);
+ putback_lru_pages(&source);
}
out:
return ret;
@@ -1485,37 +1149,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
}
#endif /* CONFIG_MOVABLE_NODE */
-static int __init cmdline_parse_movable_node(char *p)
-{
-#ifdef CONFIG_MOVABLE_NODE
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- memblock_set_bottom_up(true);
- movable_node_enabled = true;
-#else
- pr_warn("movable_node option not supported\n");
-#endif
- return 0;
-}
-early_param("movable_node", cmdline_parse_movable_node);
-
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1613,10 +1246,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long pfn, nr_pages, expire;
long offlined_pages;
int ret, drain, retry_max, node;
- unsigned long flags;
struct zone *zone;
struct memory_notify arg;
+ BUG_ON(start_pfn >= end_pfn);
/* at least, alignment against pageblock is necessary */
if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
return -EINVAL;
@@ -1627,7 +1260,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- mem_hotplug_begin();
+ lock_memory_hotplug();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -1671,8 +1304,8 @@ repeat:
drain_all_pages();
}
- pfn = scan_movable_pages(start_pfn, end_pfn);
- if (pfn) { /* We have movable pages */
+ pfn = scan_lru_pages(start_pfn, end_pfn);
+ if (pfn) { /* We have page on LRU */
ret = do_migrate_range(pfn, end_pfn);
if (!ret) {
drain = 1;
@@ -1691,11 +1324,6 @@ repeat:
yield();
/* drain pcp pages, this is synchronous. */
drain_all_pages();
- /*
- * dissolve free hugepages in the memory block before doing offlining
- * actually in order to make hugetlbfs's object counting consistent.
- */
- dissolve_free_huge_pages(start_pfn, end_pfn);
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
if (offlined_pages < 0) {
@@ -1709,12 +1337,10 @@ repeat:
/* reset pagetype flags and makes migrate type to be MOVABLE */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
+ zone->managed_pages -= offlined_pages;
zone->present_pages -= offlined_pages;
-
- pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages -= offlined_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ totalram_pages -= offlined_pages;
init_per_zone_wmark_min();
@@ -1734,7 +1360,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- mem_hotplug_done();
+ unlock_memory_hotplug();
return 0;
failed_removal:
@@ -1746,7 +1372,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
out:
- mem_hotplug_done();
+ unlock_memory_hotplug();
return ret;
}
@@ -1754,28 +1380,18 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-/**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory section walked
- *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
- *
- * Returns the return value of func.
- */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
- void *arg, int (*func)(struct memory_block *, void *))
+int remove_memory(u64 start, u64 size)
{
struct memory_block *mem = NULL;
struct mem_section *section;
+ unsigned long start_pfn, end_pfn;
unsigned long pfn, section_nr;
int ret;
+ start_pfn = PFN_DOWN(start);
+ end_pfn = start_pfn + PFN_DOWN(size);
+
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
section_nr = pfn_to_section_nr(pfn);
if (!present_section_nr(section_nr))
@@ -1792,7 +1408,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
if (!mem)
continue;
- ret = func(mem, arg);
+ ret = offline_memory_block(mem);
if (ret) {
kobject_put(&mem->dev.kobj);
return ret;
@@ -1804,172 +1420,14 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
return 0;
}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
-static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
-{
- int ret = !is_memblock_offlined(mem);
-
- if (unlikely(ret)) {
- phys_addr_t beginpa, endpa;
-
- beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
- pr_warn("removing memory fails, because memory "
- "[%pa-%pa] is onlined\n",
- &beginpa, &endpa);
- }
-
- return ret;
-}
-
-static int check_cpu_on_node(pg_data_t *pgdat)
-{
- int cpu;
-
- for_each_present_cpu(cpu) {
- if (cpu_to_node(cpu) == pgdat->node_id)
- /*
- * the cpu on this node isn't removed, and we can't
- * offline this node.
- */
- return -EBUSY;
- }
-
- return 0;
-}
-
-static void unmap_cpu_on_node(pg_data_t *pgdat)
-{
-#ifdef CONFIG_ACPI_NUMA
- int cpu;
-
- for_each_possible_cpu(cpu)
- if (cpu_to_node(cpu) == pgdat->node_id)
- numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
- int ret;
-
- ret = check_cpu_on_node(pgdat);
- if (ret)
- return ret;
-
- /*
- * the node will be offlined when we come here, so we can clear
- * the cpu_to_node() now.
- */
-
- unmap_cpu_on_node(pgdat);
- return 0;
-}
-
-/**
- * try_offline_node
- *
- * Offline a node if all memory sections and cpus of the node are removed.
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call.
- */
-void try_offline_node(int nid)
+#else
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = pgdat->node_start_pfn;
- unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
- unsigned long pfn;
- struct page *pgdat_page = virt_to_page(pgdat);
- int i;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
-
- if (!present_section_nr(section_nr))
- continue;
-
- if (pfn_to_nid(pfn) != nid)
- continue;
-
- /*
- * some memory sections of this node are not removed, and we
- * can't offline node now.
- */
- return;
- }
-
- if (check_and_unmap_cpu_on_node(pgdat))
- return;
-
- /*
- * all memory/cpu of this node are removed, we can offline this
- * node now.
- */
- node_set_offline(nid);
- unregister_one_node(nid);
-
- if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
- /* node data is allocated from boot memory */
- return;
-
- /* free waittable in each zone */
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- /*
- * wait_table may be allocated from boot memory,
- * here only free if it's allocated by vmalloc.
- */
- if (is_vmalloc_addr(zone->wait_table))
- vfree(zone->wait_table);
- }
-
- /*
- * Since there is no way to guarentee the address of pgdat/zone is not
- * on stack of any kernel threads or used by other kernel objects
- * without reference counting or other symchronizing method, do not
- * reset node_data and free pgdat here. Just reset it to 0 and reuse
- * the memory when the node is online again.
- */
- memset(pgdat, 0, sizeof(*pgdat));
+ return -EINVAL;
}
-EXPORT_SYMBOL(try_offline_node);
-
-/**
- * remove_memory
- *
- * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
- * and online/offline operations before this call, as required by
- * try_offline_node().
- */
-void __ref remove_memory(int nid, u64 start, u64 size)
+int remove_memory(u64 start, u64 size)
{
- int ret;
-
- BUG_ON(check_hotplug_memory_range(start, size));
-
- mem_hotplug_begin();
-
- /*
- * All memory blocks must be offlined before removing memory. Check
- * whether all memory blocks in question are offline and trigger a BUG()
- * if this is not the case.
- */
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- check_memblock_offlined_cb);
- if (ret)
- BUG();
-
- /* remove memmap entry */
- firmware_map_remove(start, start + size, "System RAM");
-
- arch_remove_memory(start, size);
-
- try_offline_node(nid);
-
- mem_hotplug_done();
+ return -EINVAL;
}
-EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
+EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f4..e2df1c1fb41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
- * As a special case NUMA_NO_NODE here means do the allocation
+ * As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
@@ -65,8 +65,6 @@
kernel is not always grateful with that.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -93,7 +91,6 @@
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
-#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -126,19 +123,16 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
static struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
+ int node;
if (!pol) {
- int node = numa_node_id();
-
- if (node != NUMA_NO_NODE) {
+ node = numa_node_id();
+ if (node != -1)
pol = &preferred_node_policy[node];
- /*
- * preferred_node_policy is not initialised early in
- * boot
- */
- if (!pol->mode)
- pol = NULL;
- }
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
}
return pol;
@@ -167,7 +161,19 @@ static const struct mempolicy_operations {
/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
- return nodes_intersects(*nodemask, node_states[N_MEMORY]);
+ int nd, k;
+
+ for_each_node_mask(nd, *nodemask) {
+ struct zone *z;
+
+ for (k = 0; k <= policy_zone; k++) {
+ z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ return 1;
+ }
+ }
+
+ return 0;
}
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -264,7 +270,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
struct mempolicy *policy;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
- mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
@@ -479,11 +485,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
-/*
- * Scan through pages checking if pages follow certain conditions,
- * and move them to the pagelist if they do.
- */
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+/* Scan through pages checking if pages follow certain conditions. */
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -505,8 +508,9 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
+ * And we cannot move PageKsm pages sensibly or safely yet.
*/
- if (PageReserved(page))
+ if (PageReserved(page) || PageKsm(page))
continue;
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -521,36 +525,7 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return addr != end;
}
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
- pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
-#ifdef CONFIG_HUGETLB_PAGE
- int nid;
- struct page *page;
- spinlock_t *ptl;
- pte_t entry;
-
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- entry = huge_ptep_get((pte_t *)pmd);
- if (!pte_present(entry))
- goto unlock;
- page = pte_page(entry);
- nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
- goto unlock;
- /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
- if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, private);
-unlock:
- spin_unlock(ptl);
-#else
- BUG();
-#endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -561,24 +536,17 @@ static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (!pmd_present(*pmd))
- continue;
- if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
- queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
- flags, private);
- continue;
- }
split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
+ if (check_pte_range(vma, pmd, addr, next, nodes,
flags, private))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -589,18 +557,16 @@ static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
- continue;
if (pud_none_or_clear_bad(pud))
continue;
- if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
+ if (check_pmd_range(vma, pud, addr, next, nodes,
flags, private))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
+static inline int check_pgd_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -613,14 +579,14 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
+ if (check_pud_range(vma, pgd, addr, next, nodes,
flags, private))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
}
-#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
@@ -634,6 +600,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
@@ -647,27 +614,26 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
{
return 0;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
/*
- * Walk through page tables and collect pages to be migrated.
- *
- * If pages found in a given range are on a set of nodes (determined by
- * @nodes and @flags,) it's isolated and queued to the pagelist which is
- * passed via @private.)
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
*/
-static int
-queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+static struct vm_area_struct *
+check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
- int err = 0;
- struct vm_area_struct *vma, *prev;
+ int err;
+ struct vm_area_struct *first, *vma, *prev;
- vma = find_vma(mm, start);
- if (!vma)
- return -EFAULT;
+
+ first = find_vma(mm, start);
+ if (!first)
+ return ERR_PTR(-EFAULT);
prev = NULL;
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
@@ -677,11 +643,14 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
}
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
if (flags & MPOL_MF_LAZY) {
change_prot_numa(vma, start, endvma);
goto next;
@@ -691,15 +660,17 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma))) {
- err = queue_pages_pgd_range(vma, start, endvma, nodes,
+ err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
- if (err)
+ if (err) {
+ first = ERR_PTR(err);
break;
+ }
}
next:
prev = vma;
}
- return err;
+ return first;
}
/*
@@ -774,10 +745,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (prev) {
vma = prev;
next = vma->vm_next;
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
- /* vma_merge() joined vma && vma->next, case 8 */
- goto replace;
+ continue;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
@@ -789,7 +757,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
- replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
@@ -799,6 +766,36 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
return err;
}
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
@@ -835,6 +832,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
@@ -1001,11 +999,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1028,14 +1022,15 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, new_node_page, dest,
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
- putback_movable_pages(&pagelist);
+ putback_lru_pages(&pagelist);
}
return err;
@@ -1098,7 +1093,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
- int source = NUMA_NO_NODE;
+ int source = -1;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1133,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
- if (source == NUMA_NO_NODE)
+ if (source == -1)
break;
node_clear(source, tmp);
@@ -1153,17 +1148,16 @@ out:
/*
* Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
+ * Start assuming that page is mapped by vma pointed to by @private.
* Search forward from there, if not. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
unsigned long uninitialized_var(address);
- vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
@@ -1171,10 +1165,6 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
vma = vma->vm_next;
}
- if (PageHuge(page)) {
- BUG_ON(!vma);
- return alloc_huge_page_noerr(vma, address, 1);
- }
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
@@ -1193,7 +1183,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_page(struct page *page, unsigned long start, int **x)
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
return NULL;
}
@@ -1203,6 +1193,7 @@ static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
+ struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
@@ -1244,7 +1235,7 @@ static long do_mbind(unsigned long start, unsigned long len,
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
- nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
+ nmask ? nodes_addr(*nmask)[0] : -1);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
@@ -1268,9 +1259,11 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- err = queue_pages_range(mm, start, end, nmask,
+ vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- if (!err)
+
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
if (!err) {
@@ -1278,16 +1271,18 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_page, NULL,
- start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ nr_failed = migrate_pages(&pagelist, new_vma_page,
+ (unsigned long)vma,
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
- putback_movable_pages(&pagelist);
+ putback_lru_pages(&pagelist);
}
if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
- putback_movable_pages(&pagelist);
+ putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_out:
@@ -1363,7 +1358,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
- unsigned long, mode, const unsigned long __user *, nmask,
+ unsigned long, mode, unsigned long __user *, nmask,
unsigned long, maxnode, unsigned, flags)
{
nodemask_t nodes;
@@ -1384,7 +1379,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
}
/* Set the process memory policy */
-SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
+SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
unsigned long, maxnode)
{
int err;
@@ -1526,10 +1521,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
- compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode,
- compat_ulong_t, addr, compat_ulong_t, flags)
+asmlinkage long compat_sys_get_mempolicy(int __user *policy,
+ compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode,
+ compat_ulong_t addr, compat_ulong_t flags)
{
long err;
unsigned long __user *nm = NULL;
@@ -1556,8 +1551,8 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
return err;
}
-COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode)
+asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1579,9 +1574,9 @@ COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
-COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
- compat_ulong_t, mode, compat_ulong_t __user *, nmask,
- compat_ulong_t, maxnode, compat_ulong_t, flags)
+asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
+ compat_ulong_t mode, compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode, compat_ulong_t flags)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1607,9 +1602,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
/*
* get_vma_policy(@task, @vma, @addr)
- * @task: task for fallback if vma policy == default
- * @vma: virtual memory area whose policy is sought
- * @addr: address in @vma for shared policy lookup
+ * @task - task for fallback if vma policy == default
+ * @vma - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
@@ -1649,50 +1644,6 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
-bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
-{
- struct mempolicy *pol = get_task_policy(task);
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
-
- pol = vma->vm_ops->get_policy(vma, vma->vm_start);
- if (pol && (pol->flags & MPOL_F_MOF))
- ret = true;
- mpol_cond_put(pol);
-
- return ret;
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
- }
- }
-
- if (!pol)
- return default_policy.flags & MPOL_F_MOF;
-
- return pol->flags & MPOL_F_MOF;
-}
-
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
-{
- enum zone_type dynamic_policy_zone = policy_zone;
-
- BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
-
- /*
- * if policy->v.nodes has movable memory only,
- * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
- *
- * policy->v.nodes is intersect with node_states[N_MEMORY].
- * so if the following test faile, it implies
- * policy->v.nodes has movable memory only.
- */
- if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
- dynamic_policy_zone = ZONE_MOVABLE;
-
- return zone >= dynamic_policy_zone;
-}
-
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
@@ -1701,7 +1652,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
- apply_policy_zone(policy, gfp_zone(gfp)) &&
+ gfp_zone(gfp) >= policy_zone &&
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
return &policy->v.nodes;
@@ -1752,18 +1703,21 @@ static unsigned interleave_nodes(struct mempolicy *policy)
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
+ * @policy must be protected by freeing by the caller. If @policy is
+ * the current task's mempolicy, this protection is implicit, as only the
+ * task can change it's policy. The system default policy requires no
+ * such protection.
*/
-unsigned int mempolicy_slab_node(void)
+unsigned slab_node(void)
{
struct mempolicy *policy;
- int node = numa_mem_id();
if (in_interrupt())
- return node;
+ return numa_node_id();
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
- return node;
+ return numa_node_id();
switch (policy->mode) {
case MPOL_PREFERRED:
@@ -1783,11 +1737,11 @@ unsigned int mempolicy_slab_node(void)
struct zonelist *zonelist;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(node)->node_zonelists[0];
+ zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone ? zone->node : node;
+ return zone ? zone->node : numa_node_id();
}
default:
@@ -1802,7 +1756,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int c;
- int nid = NUMA_NO_NODE;
+ int nid = -1;
if (!nnodes)
return numa_node_id();
@@ -1839,11 +1793,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
/*
* Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
+ * (returns -1 if nodemask is empty)
*/
int node_random(const nodemask_t *maskp)
{
- int w, bit = NUMA_NO_NODE;
+ int w, bit = -1;
w = nodes_weight(*maskp);
if (w)
@@ -1855,18 +1809,18 @@ int node_random(const nodemask_t *maskp)
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
- * @vma: virtual memory area whose policy is sought
- * @addr: address in @vma for shared policy lookup and interleave policy
- * @gfp_flags: for requested zone
- * @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
*
* Returns a zonelist suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
- * Must be protected by read_mems_allowed_begin()
+ * Must be protected by get_mems_allowed()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -2030,7 +1984,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
- cpuset_mems_cookie = read_mems_allowed_begin();
+ cpuset_mems_cookie = get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -2038,7 +1992,7 @@ retry_cpuset:
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
@@ -2048,7 +2002,7 @@ retry_cpuset:
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
@@ -2082,7 +2036,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
pol = &default_policy;
retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
+ cpuset_mems_cookie = get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
@@ -2095,23 +2049,13 @@ retry_cpuset:
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
-int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
-{
- struct mempolicy *pol = mpol_dup(vma_policy(src));
-
- if (IS_ERR(pol))
- return PTR_ERR(pol);
- dst->vm_policy = pol;
- return 0;
-}
-
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
@@ -2139,6 +2083,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
+ rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
@@ -2146,6 +2091,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
+ rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
@@ -2269,9 +2215,9 @@ static void sp_free(struct sp_node *n)
/**
* mpol_misplaced - check whether current page node is valid in policy
*
- * @page: page to be checked
- * @vma: vm area where page mapped
- * @addr: virtual address where page mapped
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id.
@@ -2289,8 +2235,6 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
- int thiscpu = raw_smp_processor_id();
- int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
@@ -2339,9 +2283,33 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- polnid = thisnid;
+ int last_nid;
+
+ polnid = numa_node_id();
- if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ /*
+ * Multi-stage node selection is used in conjunction
+ * with a periodic migration fault to build a temporal
+ * task<->page relation. By using a two-stage filter we
+ * remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist
+ * probability, we can equate a task's usage of a
+ * particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and
+ * getting the same result twice in a row, given these
+ * samples are fully independent, is then given by
+ * P(n)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making
+ * it less likely we act on an unlikely task<->page
+ * relation.
+ */
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
goto out;
}
@@ -2417,9 +2385,9 @@ restart:
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
- sp_node_init(n_new, end, n->end, mpol_new);
- n->end = start;
+ sp_node_init(n_new, n->end, end, mpol_new);
sp_insert(sp, n_new);
+ n->end = start;
n_new = NULL;
mpol_new = NULL;
break;
@@ -2515,7 +2483,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2547,7 +2515,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
}
#ifdef CONFIG_NUMA_BALANCING
-static int __initdata numabalancing_override;
+static bool __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
@@ -2556,15 +2524,9 @@ static void __init check_numabalancing_enable(void)
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
- /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
- if (numabalancing_override)
- set_numabalancing_state(numabalancing_override == 1);
-
if (nr_node_ids > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
- numabalancing_default ? "Enabling" : "Disabling");
+ printk(KERN_INFO "Enabling automatic NUMA balancing. "
+ "Configure with numa_balancing= or sysctl");
set_numabalancing_state(numabalancing_default);
}
}
@@ -2574,17 +2536,18 @@ static int __init setup_numabalancing(char *str)
int ret = 0;
if (!str)
goto out;
+ numabalancing_override = true;
if (!strcmp(str, "enable")) {
- numabalancing_override = 1;
+ set_numabalancing_state(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
- numabalancing_override = -1;
+ set_numabalancing_state(false);
ret = 1;
}
out:
if (!ret)
- pr_warn("Unable to parse numa_balancing=\n");
+ printk(KERN_WARNING "Unable to parse numa_balancing=\n");
return ret;
}
@@ -2644,7 +2607,7 @@ void __init numa_policy_init(void)
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
- pr_err("%s: interleaving failed\n", __func__);
+ printk("numa_policy_init: interleaving failed\n");
check_numabalancing_enable();
}
@@ -2812,45 +2775,62 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
- * Convert @pol into a string. If @buffer is too short, truncate the string.
- * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
- * longest flag, "relative", and to display at least a few node ids.
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
*/
-void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
- nodemask_t nodes = NODE_MASK_NONE;
- unsigned short mode = MPOL_DEFAULT;
- unsigned short flags = 0;
+ int l;
+ nodemask_t nodes;
+ unsigned short mode;
+ unsigned short flags = pol ? pol->flags : 0;
- if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
+ /*
+ * Sanity check: room for longest mode, flag and some nodes
+ */
+ VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+
+ if (!pol || pol == &default_policy)
+ mode = MPOL_DEFAULT;
+ else
mode = pol->mode;
- flags = pol->flags;
- }
switch (mode) {
case MPOL_DEFAULT:
+ nodes_clear(nodes);
break;
+
case MPOL_PREFERRED:
+ nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
+
case MPOL_BIND:
+ /* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
+
default:
- WARN_ON_ONCE(1);
- snprintf(p, maxlen, "unknown");
- return;
+ return -EINVAL;
}
- p += snprintf(p, maxlen, "%s", policy_modes[mode]);
+ l = strlen(policy_modes[mode]);
+ if (buffer + maxlen < p + l + 1)
+ return -ENOSPC;
+
+ strcpy(p, policy_modes[mode]);
+ p += l;
if (flags & MPOL_MODE_FLAGS) {
- p += snprintf(p, buffer + maxlen - p, "=");
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = '=';
/*
* Currently, the only defined flags are mutually exclusive
@@ -2862,7 +2842,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
}
if (!nodes_empty(nodes)) {
- p += snprintf(p, buffer + maxlen - p, ":");
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = ':';
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
+ return p - buffer;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c720..54990476c04 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
@@ -74,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
- pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
+ pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
if (!pool)
return NULL;
pool->elements = kmalloc_node(min_nr * sizeof(void *),
@@ -193,7 +192,6 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
*/
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
@@ -202,7 +200,6 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait;
gfp_t gfp_temp;
- VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_WAIT);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
@@ -223,11 +220,6 @@ repeat_alloc:
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
- /*
- * Update the allocation stack trace as this is more useful
- * for debugging.
- */
- kmemleak_update_trace(element);
return element;
}
@@ -312,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (unlikely(pool->curr_nr < pool->min_nr)) {
+ if (pool->curr_nr < pool->min_nr) {
spin_lock_irqsave(&pool->lock, flags);
- if (likely(pool->curr_nr < pool->min_nr)) {
+ if (pool->curr_nr < pool->min_nr) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e0beaa9184..c38778610aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,7 +36,6 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>
-#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
@@ -72,12 +71,28 @@ int migrate_prep_local(void)
}
/*
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking evictable pages back onto unevictable list.
+ */
+void putback_lru_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+}
+
+/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
*
- * This function shall be used whenever the isolated pageset has been
- * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * This function shall be used instead of putback_lru_pages(),
+ * whenever the isolated pageset has been built by isolate_migratepages_range()
*/
void putback_movable_pages(struct list_head *l)
{
@@ -85,14 +100,10 @@ void putback_movable_pages(struct list_head *l)
struct page *page2;
list_for_each_entry_safe(page, page2, l, lru) {
- if (unlikely(PageHuge(page))) {
- putback_active_hugepage(page);
- continue;
- }
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(isolated_balloon_page(page)))
+ if (unlikely(balloon_page_movable(page)))
balloon_page_putback(page);
else
putback_lru_page(page);
@@ -115,11 +126,13 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, addr);
if (!ptep)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
+ ptl = &mm->page_table_lock;
} else {
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ if (pmd_trans_huge(*pmd))
+ goto out;
ptep = pte_offset_map(pmd, addr);
@@ -144,17 +157,13 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
- if (pte_swp_soft_dirty(*ptep))
- pte = pte_mksoft_dirty(pte);
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(new)) {
+ if (PageHuge(new))
pte = pte_mkhuge(pte);
- pte = arch_make_huge_pte(pte, vma, new, 0);
- }
#endif
- flush_dcache_page(new);
+ flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
if (PageHuge(new)) {
@@ -176,49 +185,12 @@ out:
}
/*
- * Congratulations to trinity for discovering this bug.
- * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
- * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
- * replace the specified range by file ptes throughout (maybe populated after).
- * If page migration finds a page within that range, while it's still located
- * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
- * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
- * But if the migrating page is in a part of the vma outside the range to be
- * remapped, then it will not be cleared, and remove_migration_ptes() needs to
- * deal with it. Fortunately, this part of the vma is of course still linear,
- * so we just need to use linear location on the nonlinear list.
- */
-static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
-{
- struct vm_area_struct *vma;
- /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long addr;
-
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr >= vma->vm_start && addr < vma->vm_end)
- remove_migration_pte(page, vma, addr, arg);
- }
- return SWAP_AGAIN;
-}
-
-/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- struct rmap_walk_control rwc = {
- .rmap_one = remove_migration_pte,
- .arg = old,
- .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
- };
-
- rmap_walk(new, &rwc);
+ rmap_walk(new, remove_migration_pte, old);
}
/*
@@ -226,14 +198,15 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
- spinlock_t *ptl)
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
{
- pte_t pte;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
swp_entry_t entry;
struct page *page;
- spin_lock(ptl);
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
@@ -261,21 +234,6 @@ out:
pte_unmap_unlock(ptep, ptl);
}
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
-{
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- pte_t *ptep = pte_offset_map(pmd, address);
- __migration_entry_wait(mm, ptep, ptl);
-}
-
-void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte)
-{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
- __migration_entry_wait(mm, pte, ptl);
-}
-
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
@@ -334,17 +292,16 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
-int migrate_page_move_mapping(struct address_space *mapping,
+static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
- int extra_count)
+ struct buffer_head *head, enum migrate_mode mode)
{
- int expected_count = 1 + extra_count;
+ int expected_count = 0;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
- if (page_count(page) != expected_count)
+ if (page_count(page) != 1)
return -EAGAIN;
return MIGRATEPAGE_SUCCESS;
}
@@ -354,7 +311,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(page));
- expected_count += 1 + page_has_private(page);
+ expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
@@ -463,60 +420,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
}
/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page. We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
- int nr_pages)
-{
- int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < nr_pages; ) {
- cond_resched();
- copy_highpage(dst, src);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-static void copy_huge_page(struct page *dst, struct page *src)
-{
- int i;
- int nr_pages;
-
- if (PageHuge(src)) {
- /* hugetlbfs page */
- struct hstate *h = page_hstate(src);
- nr_pages = pages_per_huge_page(h);
-
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
- __copy_gigantic_page(dst, src, nr_pages);
- return;
- }
- } else {
- /* thp page */
- BUG_ON(!PageTransHuge(src));
- nr_pages = hpage_nr_pages(src);
- }
-
- for (i = 0; i < nr_pages; i++) {
- cond_resched();
- copy_highpage(dst + i, src + i);
- }
-}
-
-/*
* Copy the page to its new location
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- int cpupid;
-
if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
@@ -529,7 +436,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageUptodate(page))
SetPageUptodate(newpage);
if (TestClearPageActive(page)) {
- VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON(PageUnevictable(page));
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
@@ -553,19 +460,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
- /*
- * Copy NUMA information to the new page, to prevent over-eager
- * future migrations of this same page.
- */
- cpupid = page_cpupid_xchg_last(page, -1);
- page_cpupid_xchg_last(newpage, cpupid);
-
mlock_migrate_page(newpage, page);
ksm_migrate_page(newpage, page);
- /*
- * Please do not reorder this without considering how mm/ksm.c's
- * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
- */
+
ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -582,6 +479,14 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Migration functions
***********************************************************/
+/* Always fail migration. Used for mappings that are not movable */
+int fail_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
/*
* Common logic to directly migrate a single page suitable for
* pages that do not use PagePrivate/PagePrivate2.
@@ -596,7 +501,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -623,7 +528,7 @@ int buffer_migrate_page(struct address_space *mapping,
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -791,7 +696,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}
static int __unmap_and_move(struct page *page, struct page *newpage,
- int force, enum migrate_mode mode)
+ int force, bool offlining, enum migrate_mode mode)
{
int rc = -EAGAIN;
int remap_swapcache = 1;
@@ -821,12 +726,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
lock_page(page);
}
+ /*
+ * Only memory hotplug's offline_pages() caller has locked out KSM,
+ * and can safely migrate a KSM page. The other cases have skipped
+ * PageKsm along with PageReserved - but it is only now when we have
+ * the page lock that we can be certain it will not go KSM beneath us
+ * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
+ * its pagecount raised, but only here do we take the page lock which
+ * serializes that).
+ */
+ if (PageKsm(page) && !offlining) {
+ rc = -EBUSY;
+ goto unlock;
+ }
+
/* charge against new page */
mem_cgroup_prepare_migration(page, newpage, &mem);
if (PageWriteback(page)) {
/*
- * Only in the case of a full synchronous migration is it
+ * Only in the case of a full syncronous migration is it
* necessary to wait for PageWriteback. In the async case,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
@@ -847,7 +766,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
- if (PageAnon(page) && !PageKsm(page)) {
+ if (PageAnon(page)) {
/*
* Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
@@ -901,7 +820,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- VM_BUG_ON_PAGE(PageAnon(page), page);
+ VM_BUG_ON(PageAnon(page));
if (page_has_private(page)) {
try_to_free_buffers(page);
goto uncharge;
@@ -927,6 +846,7 @@ uncharge:
mem_cgroup_end_migration(mem, page, newpage,
(rc == MIGRATEPAGE_SUCCESS ||
rc == MIGRATEPAGE_BALLOON_SUCCESS));
+unlock:
unlock_page(page);
out:
return rc;
@@ -936,8 +856,8 @@ out:
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct page *page, int force,
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining,
enum migrate_mode mode)
{
int rc = 0;
@@ -956,7 +876,7 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
if (unlikely(split_huge_page(page)))
goto out;
- rc = __unmap_and_move(page, newpage, force, mode);
+ rc = __unmap_and_move(page, newpage, force, offlining, mode);
if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
/*
@@ -982,17 +902,11 @@ out:
page_is_file_cache(page));
putback_lru_page(page);
}
-
/*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, putback_lru_page() will drop the reference grabbed
- * during isolation.
+ * Move the new page to the LRU. If migration was not successful
+ * then this will free the page.
*/
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
- put_new_page(newpage, private);
- else
- putback_lru_page(newpage);
-
+ putback_lru_page(newpage);
if (result) {
if (rc)
*result = rc;
@@ -1021,28 +935,15 @@ out:
* will wait in the page fault for migration to complete.
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
- struct page *hpage, int force,
+ unsigned long private, struct page *hpage,
+ int force, bool offlining,
enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
- struct page *new_hpage;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
struct anon_vma *anon_vma = NULL;
- /*
- * Movability of hugepages depends on architectures and hugepage size.
- * This check is necessary because some callers of hugepage migration
- * like soft offline and memory hotremove don't walk through page
- * tables or check whether the hugepage is pmd-based or not before
- * kicking migration.
- */
- if (!hugepage_migration_supported(page_hstate(hpage))) {
- putback_active_hugepage(hpage);
- return -ENOSYS;
- }
-
- new_hpage = get_new_page(hpage, private, &result);
if (!new_hpage)
return -ENOMEM;
@@ -1062,30 +963,18 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (!page_mapped(hpage))
rc = move_to_new_page(new_hpage, hpage, 1, mode);
- if (rc != MIGRATEPAGE_SUCCESS)
+ if (rc)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
put_anon_vma(anon_vma);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (!rc)
hugetlb_cgroup_migrate(hpage, new_hpage);
unlock_page(hpage);
out:
- if (rc != -EAGAIN)
- putback_active_hugepage(hpage);
-
- /*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, put_page() will drop the reference grabbed during
- * isolation.
- */
- if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
- put_new_page(new_hpage, private);
- else
- put_page(new_hpage);
-
+ put_page(new_hpage);
if (result) {
if (rc)
*result = rc;
@@ -1096,28 +985,22 @@ out:
}
/*
- * migrate_pages - migrate the pages specified in a list, to the free pages
- * supplied as the target for the page migration
+ * migrate_pages
*
- * @from: The list of pages to be migrated.
- * @get_new_page: The function used to allocate free pages to be used
- * as the target of the page migration.
- * @put_new_page: The function used to free target pages if migration
- * fails, or NULL if no special handling is necessary.
- * @private: Private data to be passed on to get_new_page()
- * @mode: The migration mode that specifies the constraints for
- * page migration, if any.
- * @reason: The reason for page migration.
+ * The function takes one list of pages to migrate and a function
+ * that determines from the page to be migrated and the private data
+ * the target of the move and allocates the page.
*
- * The function returns after 10 attempts or if no pages are movable any more
- * because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_lru_pages() to return pages to the LRU
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ * Caller should call putback_lru_pages to return pages to the LRU
* or free list only if ret != 0.
*
- * Returns the number of pages that were not migrated, or an error code.
+ * Return: Number of pages not migrated or error code.
*/
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
- free_page_t put_new_page, unsigned long private,
+int migrate_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private, bool offlining,
enum migrate_mode mode, int reason)
{
int retry = 1;
@@ -1138,13 +1021,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
list_for_each_entry_safe(page, page2, from, lru) {
cond_resched();
- if (PageHuge(page))
- rc = unmap_and_move_huge_page(get_new_page,
- put_new_page, private, page,
- pass > 2, mode);
- else
- rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode);
+ rc = unmap_and_move(get_new_page, private,
+ page, pass > 2, offlining,
+ mode);
switch(rc) {
case -ENOMEM:
@@ -1156,12 +1035,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
nr_succeeded++;
break;
default:
- /*
- * Permanent failure (-EBUSY, -ENOSYS, etc.):
- * unlike -EAGAIN case, the failed page is
- * removed from migration page list and not
- * retried in the next outer loop.
- */
+ /* Permanent failure */
nr_failed++;
break;
}
@@ -1181,6 +1055,34 @@ out:
return rc;
}
+int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
+ unsigned long private, bool offlining,
+ enum migrate_mode mode)
+{
+ int pass, rc;
+
+ for (pass = 0; pass < 10; pass++) {
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, hpage, pass > 2, offlining,
+ mode);
+ switch (rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ /* try again */
+ cond_resched();
+ break;
+ case MIGRATEPAGE_SUCCESS:
+ goto out;
+ default:
+ rc = -EIO;
+ goto out;
+ }
+ }
+out:
+ return rc;
+}
+
#ifdef CONFIG_NUMA
/*
* Move a list of individual pages
@@ -1205,12 +1107,8 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
- if (PageHuge(p))
- return alloc_huge_page_node(page_hstate(compound_head(p)),
- pm->node);
- else
- return alloc_pages_exact_node(pm->node,
- GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
+ return alloc_pages_exact_node(pm->node,
+ GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
}
/*
@@ -1252,7 +1150,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto set_status;
/* Use PageReserved to check for zero page */
- if (PageReserved(page))
+ if (PageReserved(page) || PageKsm(page))
goto put_and_set;
pp->page = page;
@@ -1269,11 +1167,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
!migrate_all)
goto put_and_set;
- if (PageHuge(page)) {
- isolate_huge_page(page, &pagelist);
- goto put_and_set;
- }
-
err = isolate_lru_page(page);
if (!err) {
list_add_tail(&page->lru, &pagelist);
@@ -1293,10 +1186,11 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node, NULL,
- (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, new_page_node,
+ (unsigned long)pm, 0, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
- putback_movable_pages(&pagelist);
+ putback_lru_pages(&pagelist);
}
up_read(&mm->mmap_sem);
@@ -1418,7 +1312,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
err = -ENOENT;
/* Use PageReserved to check for zero page */
- if (!page || PageReserved(page))
+ if (!page || PageReserved(page) || PageKsm(page))
goto set_status;
err = page_to_nid(page);
@@ -1565,7 +1459,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
* pages. Currently it only checks the watermarks which crude
*/
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
- unsigned long nr_migrate_pages)
+ int nr_migrate_pages)
{
int z;
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1574,7 +1468,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
if (!populated_zone(zone))
continue;
- if (!zone_reclaimable(zone))
+ if (zone->all_unreclaimable)
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -1596,10 +1490,12 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
struct page *newpage;
newpage = alloc_pages_exact_node(nid,
- (GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE | __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_NOWARN) &
+ (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN) &
~GFP_IOFS, 0);
+ if (newpage)
+ page_xchg_last_nid(newpage, page_last_nid(page));
return newpage;
}
@@ -1633,85 +1529,65 @@ bool migrate_ratelimited(int node)
}
/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
- unsigned long nr_pages)
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
{
+ bool rate_limited = false;
+
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
+ spin_lock(&pgdat->numabalancing_migrate_lock);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
- spin_lock(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
- spin_unlock(&pgdat->numabalancing_migrate_lock);
- }
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
- trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
- nr_pages);
- return true;
}
-
- /*
- * This is an unlocked non-atomic update so errors are possible.
- * The consequences are failing to migrate when we potentiall should
- * have which is not severe enough to warrant locking. If it is ever
- * a problem, it can be converted to a per-cpu counter.
- */
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- return false;
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+ rate_limited = true;
+ else
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+ return rate_limited;
}
-static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
-
- VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ int ret = 0;
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
- return 0;
+ if (migrate_balanced_pgdat(pgdat, 1)) {
+ int page_lru;
- if (isolate_lru_page(page))
- return 0;
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ return 0;
+ }
- /*
- * migrate_misplaced_transhuge_page() skips page migration's usual
- * check on page_count(), so we must do it here, now that the page
- * has been isolated: a GUP pin, or any other pin, prevents migration.
- * The expected page count is 3: 1 for page's mapcount and 1 for the
- * caller's pin and 1 for the reference taken by isolate_lru_page().
- */
- if (PageTransHuge(page) && page_count(page) != 3) {
- putback_lru_page(page);
- return 0;
+ /* Page is isolated */
+ ret = 1;
+ page_lru = page_is_file_cache(page);
+ if (!PageTransHuge(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+ else
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ HPAGE_PMD_NR);
}
- page_lru = page_is_file_cache(page);
- mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
-
/*
- * Isolating the page has taken another reference, so the
- * caller's reference can be safely dropped without the page
- * disappearing underneath us during migration.
+ * Page is either isolated or there is not enough space on the target
+ * node. If isolated, then it has taken a reference count and the
+ * callers reference can be safely dropped without the page
+ * disappearing underneath us during migration. Otherwise the page is
+ * not to be migrated but the callers reference should still be
+ * dropped so it does not leak.
*/
put_page(page);
- return 1;
-}
-bool pmd_trans_migrating(pmd_t pmd)
-{
- struct page *page = pmd_page(pmd);
- return PageLocked(page);
-}
-
-void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
- struct page *page = pmd_page(*pmd);
- wait_on_page_locked(page);
+ return ret;
}
/*
@@ -1719,77 +1595,72 @@ void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_page(struct page *page, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
- int isolated;
+ int isolated = 0;
int nr_remaining;
LIST_HEAD(migratepages);
/*
- * Don't migrate file pages that are mapped in multiple processes
- * with execute permissions as they are probably shared libraries.
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
- (vma->vm_flags & VM_EXEC))
+ if (page_mapcount(page) != 1) {
+ put_page(page);
goto out;
+ }
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
- if (numamigrate_update_ratelimit(pgdat, 1))
+ if (numamigrate_update_ratelimit(pgdat, 1)) {
+ put_page(page);
goto out;
+ }
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- NULL, node, MIGRATE_ASYNC,
- MR_NUMA_MISPLACED);
+ nr_remaining = migrate_pages(&migratepages,
+ alloc_misplaced_dst_page,
+ node, false, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
if (nr_remaining) {
- if (!list_empty(&migratepages)) {
- list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
+ putback_lru_pages(&migratepages);
isolated = 0;
} else
count_vm_numa_event(NUMA_PAGE_MIGRATE);
BUG_ON(!list_empty(&migratepages));
- return isolated;
-
out:
- put_page(page);
- return 0;
+ return isolated;
}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct vm_area_struct *vma,
pmd_t *pmd, pmd_t entry,
unsigned long address,
struct page *page, int node)
{
- spinlock_t *ptl;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
- pmd_t orig_entry;
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
/*
* Rate-limit the amount of data that is being migrated to a node.
@@ -1800,20 +1671,32 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
- HPAGE_PMD_ORDER);
- if (!new_page)
- goto out_fail;
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out_dropref;
+ }
+ page_xchg_last_nid(new_page, page_last_nid(page));
isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated) {
+
+ /*
+ * Failing to isolate or a GUP pin prevents migration. The expected
+ * page count is 2. 1 for anonymous pages without a mapping and 1
+ * for the callers pin. If the page was isolated, the page will
+ * need to be put back on the LRU.
+ */
+ if (!isolated || page_count(page) != 2) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
put_page(new_page);
- goto out_fail;
+ if (isolated) {
+ putback_lru_page(page);
+ isolated = 0;
+ goto out;
+ }
+ goto out_keep_locked;
}
- if (mm_tlb_flush_pending(mm))
- flush_tlb_range(vma, mmun_start, mmun_end);
-
/* Prepare a page as a migration target */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
@@ -1825,12 +1708,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
-fail_putback:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1842,13 +1722,11 @@ fail_putback:
unlock_page(new_page);
put_page(new_page); /* Free it */
- /* Retake the callers reference and putback on LRU */
- get_page(page);
+ unlock_page(page);
putback_lru_page(page);
- mod_zone_page_state(page_zone(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
- goto out_unlock;
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out;
}
/*
@@ -1860,47 +1738,23 @@ fail_putback:
*/
mem_cgroup_prepare_migration(page, new_page, &memcg);
- orig_entry = *pmd;
entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
+ entry = pmd_mknonnuma(entry);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
- /*
- * Clear the old entry under pagetable lock and establish the new PTE.
- * Any parallel GUP will either observe the old page blocking on the
- * page lock, block on the page table lock or observe the new page.
- * The SetPageUptodate on the new page and page_add_new_anon_rmap
- * guarantee the copy is visible before the pagetable update.
- */
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
- pmdp_clear_flush(vma, mmun_start, pmd);
- set_pmd_at(mm, mmun_start, pmd, entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
- update_mmu_cache_pmd(vma, address, &entry);
-
- if (page_count(page) != 2) {
- set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
- update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
- goto fail_putback;
- }
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(page);
-
/*
* Finish the charge transaction under the page table lock to
* prevent split_huge_page() from dividing up the charge
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- /* Take an "isolate" reference and put new page on the LRU. */
- get_page(new_page);
- putback_lru_page(new_page);
+ spin_unlock(&mm->page_table_lock);
unlock_page(new_page);
unlock_page(page);
@@ -1910,25 +1764,15 @@ fail_putback:
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+out:
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
return isolated;
-out_fail:
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
- ptl = pmd_lock(mm, pmd);
- if (pmd_same(*pmd, entry)) {
- entry = pmd_mknonnuma(entry);
- set_pmd_at(mm, mmun_start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
- }
- spin_unlock(ptl);
-
-out_unlock:
- unlock_page(page);
put_page(page);
+out_keep_locked:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/mm/mincore.c b/mm/mincore.c
index 725c8096104..936b4cee8cb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -70,21 +70,13 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
-#ifdef CONFIG_SWAP
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- /*
- * shmem/tmpfs may return swap: account for swapcache
- * page too.
- */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp), swp.val);
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
page = find_get_page(mapping, pgoff);
+#ifdef CONFIG_SWAP
+ /* shmem/tmpfs may return swap: account for swapcache page too. */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swap = radix_to_swp_entry(page);
+ page = find_get_page(&swapper_space, swap.val);
+ }
#endif
if (page) {
present = PageUptodate(page);
@@ -143,8 +135,7 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
- *vec = mincore_page(swap_address_space(entry),
- pgoff);
+ *vec = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
*vec = 1;
@@ -233,6 +224,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (is_vm_hugetlb_page(vma)) {
+ mincore_hugetlb_page_range(vma, addr, end, vec);
+ return (end - addr) >> PAGE_SHIFT;
+ }
+
+ end = pmd_addr_end(addr, end);
+
if (is_vm_hugetlb_page(vma))
mincore_hugetlb_page_range(vma, addr, end, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index b1eb5363400..f0b9ce572fc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -11,7 +11,6 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
-#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
@@ -19,8 +18,6 @@
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
-#include <linux/memcontrol.h>
-#include <linux/mm_inline.h>
#include "internal.h"
@@ -79,7 +76,6 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
- /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -91,73 +87,9 @@ void mlock_vma_page(struct page *page)
}
}
-/*
- * Isolate a page from LRU with optional get_page() pin.
- * Assumes lru_lock already held and page already pinned.
- */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
-{
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
- if (getpage)
- get_page(page);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- return true;
- }
-
- return false;
-}
-
-/*
- * Finish munlock after successful page isolation
- *
- * Page must be locked. This is a wrapper for try_to_munlock()
- * and putback_lru_page() with munlock accounting.
- */
-static void __munlock_isolated_page(struct page *page)
-{
- int ret = SWAP_AGAIN;
-
- /*
- * Optimization: if the page was mapped just once, that's our mapping
- * and we don't need to check all the other vmas.
- */
- if (page_mapcount(page) > 1)
- ret = try_to_munlock(page);
-
- /* Did try_to_unlock() succeed or punt? */
- if (ret != SWAP_MLOCK)
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-
- putback_lru_page(page);
-}
-
-/*
- * Accounting for page isolation fail during munlock
- *
- * Performs accounting when page isolation fails in munlock. There is nothing
- * else to do because it means some other task has already removed the page
- * from the LRU. putback_lru_page() will take care of removing the page from
- * the unevictable list, if necessary. vmscan [page_referenced()] will move
- * the page back to the unevictable list if some other vma has it mlocked.
- */
-static void __munlock_isolation_failed(struct page *page)
-{
- if (PageUnevictable(page))
- __count_vm_event(UNEVICTABLE_PGSTRANDED);
- else
- __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-}
-
/**
* munlock_vma_page - munlock a vma page
- * @page - page to be unlocked, either a normal page or THP page head
- *
- * returns the size of the page as a page mask (0 for normal page,
- * HPAGE_PMD_NR - 1 for THP head page)
+ * @page - page to be unlocked
*
* called from munlock()/munmap() path with page supposedly on the LRU.
* When we munlock a page, because the vma where we found the page is being
@@ -170,39 +102,45 @@ static void __munlock_isolation_failed(struct page *page)
* can't isolate the page, we leave it for putback_lru_page() and vmscan
* [page_referenced()/try_to_unmap()] to deal with.
*/
-unsigned int munlock_vma_page(struct page *page)
+void munlock_vma_page(struct page *page)
{
- unsigned int nr_pages;
- struct zone *zone = page_zone(page);
-
- /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
- /*
- * Serialize with any parallel __split_huge_page_refcount() which
- * might otherwise copy PageMlocked to part of the tail pages before
- * we clear it in the head page. It also stabilizes hpage_nr_pages().
- */
- spin_lock_irq(&zone->lru_lock);
-
- nr_pages = hpage_nr_pages(page);
- if (!TestClearPageMlocked(page))
- goto unlock_out;
+ if (TestClearPageMlocked(page)) {
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
+ if (!isolate_lru_page(page)) {
+ int ret = SWAP_AGAIN;
- __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+ /*
+ * Optimization: if the page was mapped just once,
+ * that's our mapping and we don't need to check all the
+ * other vmas.
+ */
+ if (page_mapcount(page) > 1)
+ ret = try_to_munlock(page);
+ /*
+ * did try_to_unlock() succeed or punt?
+ */
+ if (ret != SWAP_MLOCK)
+ count_vm_event(UNEVICTABLE_PGMUNLOCKED);
- if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&zone->lru_lock);
- __munlock_isolated_page(page);
- goto out;
+ putback_lru_page(page);
+ } else {
+ /*
+ * Some other task has removed the page from the LRU.
+ * putback_lru_page() will take care of removing the
+ * page from the unevictable list, if necessary.
+ * vmscan [page_referenced()] will move the page back
+ * to the unevictable list if some other vma has it
+ * mlocked.
+ */
+ if (PageUnevictable(page))
+ count_vm_event(UNEVICTABLE_PGSTRANDED);
+ else
+ count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ }
}
- __munlock_isolation_failed(page);
-
-unlock_out:
- spin_unlock_irq(&zone->lru_lock);
-
-out:
- return nr_pages - 1;
}
/**
@@ -217,11 +155,13 @@ out:
*
* vma->vm_mm->mmap_sem must be held for at least read.
*/
-long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ unsigned long addr = start;
+ int nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
@@ -246,11 +186,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;
- /*
- * We made sure addr is within a VMA, so the following will
- * not result in a stack expansion that recurses back here.
- */
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
NULL, NULL, nonblocking);
}
@@ -266,188 +202,54 @@ static int __mlock_posix_error_return(long retval)
return retval;
}
-/*
- * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma - the vma containing the specfied address range
+ * @start - starting address in @vma to mlock
+ * @end - end address [+1] in @vma to mlock
*
- * The fast path is available only for evictable pages with single mapping.
- * Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need try_to_munlock() which can fail.
- * when !page_evictable(), we need the full redo logic of putback_lru_page to
- * avoid leaving evictable page in unevictable list.
+ * For mmap()/mremap()/expansion of mlocked vma.
*
- * In case of success, @page is added to @pvec and @pgrescued is incremented
- * in case that the page was previously unevictable. @page is also unlocked.
- */
-static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
- int *pgrescued)
-{
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- if (page_mapcount(page) <= 1 && page_evictable(page)) {
- pagevec_add(pvec, page);
- if (TestClearPageUnevictable(page))
- (*pgrescued)++;
- unlock_page(page);
- return true;
- }
-
- return false;
-}
-
-/*
- * Putback multiple evictable pages to the LRU
+ * return 0 on success for "normal" vmas.
*
- * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
- * the pages might have meanwhile become unevictable but that is OK.
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
*/
-static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+long mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
{
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
+ int nr_pages = (end - start) / PAGE_SIZE;
+ BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
/*
- *__pagevec_lru_add() calls release_pages() so we don't call
- * put_page() explicitly
+ * filter unlockable vmas
*/
- __pagevec_lru_add(pvec);
- count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
-}
-
-/*
- * Munlock a batch of pages from the same zone
- *
- * The work is split to two main phases. First phase clears the Mlocked flag
- * and attempts to isolate the pages, all under a single zone lru lock.
- * The second phase finishes the munlock only for pages where isolation
- * succeeded.
- *
- * Note that the pagevec may be modified during the process.
- */
-static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
-{
- int i;
- int nr = pagevec_count(pvec);
- int delta_munlocked;
- struct pagevec pvec_putback;
- int pgrescued = 0;
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ goto no_mlock;
- pagevec_init(&pvec_putback, 0);
+ if (!((vma->vm_flags & VM_DONTEXPAND) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))) {
- /* Phase 1: page isolation */
- spin_lock_irq(&zone->lru_lock);
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
+ __mlock_vma_pages_range(vma, start, end, NULL);
- if (TestClearPageMlocked(page)) {
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- if (__munlock_isolate_lru_page(page, false))
- continue;
- else
- __munlock_isolation_failed(page);
- }
-
- /*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release() would deadlock.
- */
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
- }
- delta_munlocked = -nr + pagevec_count(&pvec_putback);
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->lru_lock);
-
- /* Now we can release pins of pages that we are not munlocking */
- pagevec_release(&pvec_putback);
-
- /* Phase 2: page munlock */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (page) {
- lock_page(page);
- if (!__putback_lru_fast_prepare(page, &pvec_putback,
- &pgrescued)) {
- /*
- * Slow path. We don't want to lose the last
- * pin before unlock_page()
- */
- get_page(page); /* for putback_lru_page() */
- __munlock_isolated_page(page);
- unlock_page(page);
- put_page(page); /* from follow_page_mask() */
- }
- }
+ /* Hide errors from mmap() and other callers */
+ return 0;
}
/*
- * Phase 3: page putback for pages that qualified for the fast path
- * This will also call put_page() to return pin from follow_page_mask()
- */
- if (pagevec_count(&pvec_putback))
- __putback_lru_fast(&pvec_putback, pgrescued);
-}
-
-/*
- * Fill up pagevec for __munlock_pagevec using pte walk
- *
- * The function expects that the struct page corresponding to @start address is
- * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
- *
- * The rest of @pvec is filled by subsequent pages within the same pmd and same
- * zone, as long as the pte's are present and vm_normal_page() succeeds. These
- * pages also get pinned.
- *
- * Returns the address of the next page that should be scanned. This equals
- * @start + PAGE_SIZE when no page could be added by the pte walk.
- */
-static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- struct vm_area_struct *vma, int zoneid, unsigned long start,
- unsigned long end)
-{
- pte_t *pte;
- spinlock_t *ptl;
-
- /*
- * Initialize pte walk starting at the already pinned page where we
- * are sure that there is a pte, as it was pinned under the same
- * mmap_sem write op.
+ * User mapped kernel pages or huge pages:
+ * make these pages present to populate the ptes, but
+ * fall thru' to reset VM_LOCKED--no need to unlock, and
+ * return nr_pages so these don't get counted against task's
+ * locked limit. huge pages are already counted against
+ * locked vm limit.
*/
- pte = get_locked_pte(vma->vm_mm, start, &ptl);
- /* Make sure we do not cross the page table boundary */
- end = pgd_addr_end(start, end);
- end = pud_addr_end(start, end);
- end = pmd_addr_end(start, end);
-
- /* The page next to the pinned page is the first we will try to get */
- start += PAGE_SIZE;
- while (start < end) {
- struct page *page = NULL;
- pte++;
- if (pte_present(*pte))
- page = vm_normal_page(vma, start, *pte);
- /*
- * Break if page could not be obtained or the page's node+zone does not
- * match
- */
- if (!page || page_zone_id(page) != zoneid)
- break;
+ make_pages_present(start, end);
- get_page(page);
- /*
- * Increase the address that will be returned *before* the
- * eventual break due to pvec becoming full by adding the page
- */
- start += PAGE_SIZE;
- if (pagevec_add(pvec, page) == 0)
- break;
- }
- pte_unmap_unlock(pte, ptl);
- return start;
+no_mlock:
+ vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
+ return nr_pages; /* error or pages NOT mlocked */
}
/*
@@ -471,17 +273,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- vma->vm_flags &= ~VM_LOCKED;
+ unsigned long addr;
- while (start < end) {
- struct page *page = NULL;
- unsigned int page_mask;
- unsigned long page_increm;
- struct pagevec pvec;
- struct zone *zone;
- int zoneid;
+ lru_add_drain();
+ vma->vm_flags &= ~VM_LOCKED;
- pagevec_init(&pvec, 0);
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ struct page *page;
/*
* Although FOLL_DUMP is intended for get_dump_page(),
* it just so happens that its special treatment of the
@@ -489,48 +287,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
* suits munlock very well (and if somehow an abnormal page
* has sneaked into the range, we won't oops here: great).
*/
- page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
- &page_mask);
-
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
- lock_page(page);
- /*
- * Any THP page found by follow_page_mask() may
- * have gotten split before reaching
- * munlock_vma_page(), so we need to recompute
- * the page_mask here.
- */
- page_mask = munlock_vma_page(page);
- unlock_page(page);
- put_page(page); /* follow_page_mask() */
- } else {
- /*
- * Non-huge pages are handled in batches via
- * pagevec. The pin from follow_page_mask()
- * prevents them from collapsing by THP.
- */
- pagevec_add(&pvec, page);
- zone = page_zone(page);
- zoneid = page_zone_id(page);
-
- /*
- * Try to fill the rest of pagevec using fast
- * pte walk. This will also update start to
- * the next page to process. Then munlock the
- * pagevec.
- */
- start = __munlock_pagevec_fill(&pvec, vma,
- zoneid, start, end);
- __munlock_pagevec(&pvec, zone);
- goto next;
- }
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
- page_increm = 1 + page_mask;
- start += page_increm * PAGE_SIZE;
-next:
cond_resched();
}
}
@@ -540,7 +303,7 @@ next:
*
* Filters out "special" vmas -- VM_LOCKED never gets set for these, and
* munlock is a no-op. However, for some special vmas, we go ahead and
- * populate the ptes.
+ * populate the ptes via make_pages_present().
*
* For vmas that pass the filters, merge/split as appropriate.
*/
@@ -628,9 +391,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (on)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!on)
+ newflags &= ~VM_LOCKED;
tmp = vma->vm_end;
if (tmp > end)
@@ -653,20 +416,13 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-/*
- * __mm_populate - populate and/or mlock pages within a range of address space.
- *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
- */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
{
struct mm_struct *mm = current->mm;
unsigned long end, nstart, nend;
struct vm_area_struct *vma = NULL;
int locked = 0;
- long ret = 0;
+ int ret = 0;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -727,24 +483,22 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
lru_add_drain_all(); /* flush pagevec */
+ down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
locked = len >> PAGE_SHIFT;
-
- down_write(&current->mm->mmap_sem);
-
locked += current->mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
-
up_write(&current->mm->mmap_sem);
if (!error)
- error = __mm_populate(start, len, 0);
+ error = do_mlock_pages(start, len, 0);
return error;
}
@@ -752,37 +506,34 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
+ down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
-
- down_write(&current->mm->mmap_sem);
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
-
return ret;
}
static int do_mlockall(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
+ unsigned int def_flags = 0;
if (flags & MCL_FUTURE)
- current->mm->def_flags |= VM_LOCKED;
- else
- current->mm->def_flags &= ~VM_LOCKED;
+ def_flags = VM_LOCKED;
+ current->mm->def_flags = def_flags;
if (flags == MCL_FUTURE)
goto out;
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
vm_flags_t newflags;
- newflags = vma->vm_flags & ~VM_LOCKED;
- if (flags & MCL_CURRENT)
- newflags |= VM_LOCKED;
+ newflags = vma->vm_flags | VM_LOCKED;
+ if (!(flags & MCL_CURRENT))
+ newflags &= ~VM_LOCKED;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
- cond_resched();
}
out:
return 0;
@@ -803,18 +554,20 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
+ down_write(&current->mm->mmap_sem);
+
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
- down_write(&current->mm->mmap_sem);
-
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
- if (!ret && (flags & MCL_CURRENT))
- mm_populate(0, TASK_SIZE);
+ if (!ret && (flags & MCL_CURRENT)) {
+ /* Ignore errors */
+ do_mlock_pages(0, TASK_SIZE, 1);
+ }
out:
return ret;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936..1ffd97ae26d 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,8 +9,6 @@
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/export.h>
-#include <linux/memory.h>
-#include <linux/notifier.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -71,41 +69,34 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ "Section %d Node %d Zone %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d\n",
+ "Section %d Node %d Zone %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
- ZONES_SHIFT,
- LAST_CPUPID_SHIFT);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ ZONES_SHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
+ "Section %lu Node %lu Zone %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
- (unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_CPUPID_PGSHIFT);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
- "Node/Zone ID: %lu -> %lu\n",
- (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
- (unsigned long)ZONEID_PGOFF);
+ (unsigned long)ZONES_PGSHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
+ "Zone ID: %lu -> %lu\n",
+ (unsigned long)ZONEID_PGOFF,
+ (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
- "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
+ "location: %d -> %d unused %d -> %d flags %d -> %d\n",
shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
#ifdef NODE_NOT_IN_PAGE_FLAGS
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
"Node not in page flags");
#endif
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
- "Last cpupid not in page flags");
-#endif
if (SECTIONS_WIDTH) {
shift -= SECTIONS_WIDTH;
@@ -149,51 +140,6 @@ early_param("mminit_loglevel", set_mminit_loglevel);
struct kobject *mm_kobj;
EXPORT_SYMBOL_GPL(mm_kobj);
-#ifdef CONFIG_SMP
-s32 vm_committed_as_batch = 32;
-
-static void __meminit mm_compute_batch(void)
-{
- u64 memsized_batch;
- s32 nr = num_present_cpus();
- s32 batch = max_t(s32, nr*2, 32);
-
- /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
-
- vm_committed_as_batch = max_t(s32, memsized_batch, batch);
-}
-
-static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
- unsigned long action, void *arg)
-{
- switch (action) {
- case MEM_ONLINE:
- case MEM_OFFLINE:
- mm_compute_batch();
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
-static int __init mm_compute_batch_init(void)
-{
- mm_compute_batch();
- register_hotmemory_notifier(&compute_batch_nb);
-
- return 0;
-}
-
-__initcall(mm_compute_batch_init);
-
-#endif
-
static int __init mm_sysfs_init(void)
{
mm_kobj = kobject_create_and_add("mm", kernel_kobj);
@@ -202,4 +148,5 @@ static int __init mm_sysfs_init(void)
return 0;
}
-postcore_initcall(mm_sysfs_init);
+
+__initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 129b847d30c..35730ee9d51 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,13 +6,9 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -36,10 +32,6 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
-#include <linux/sched/sysctl.h>
-#include <linux/notifier.h>
-#include <linux/memory.h>
-#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -90,10 +82,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
@@ -132,7 +121,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ unsigned long free, allowed;
vm_acct_memory(pages);
@@ -154,7 +143,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
free -= global_page_state(NR_SHMEM);
- free += get_nr_swap_pages();
+ free += nr_swap_pages;
/*
* Any slabs which are created with the
@@ -173,10 +162,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Reserve some for root
+ * Leave the last 3% for root
*/
if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ free -= free / 32;
if (free > pages)
return 0;
@@ -184,20 +173,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = vm_commit_limit();
+ allowed = (totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100;
/*
- * Reserve some for root
+ * Leave the last 3% for root
*/
if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= allowed / 32;
+ allowed += total_swap_pages;
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
- }
+ /* Don't let a single process grow too big:
+ leave 3% of the size of this process for other processes */
+ if (mm)
+ allowed -= mm->total_vm / 32;
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -214,7 +202,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file_inode(file)->i_writecount);
+ atomic_inc(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
@@ -267,7 +255,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
- bool populate;
down_write(&mm->mmap_sem);
@@ -317,15 +304,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
-
set_brk:
mm->brk = brk;
- populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- up_write(&mm->mmap_sem);
- if (populate)
- mm_populate(oldbrk, newbrk - oldbrk);
- return brk;
-
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
@@ -364,20 +344,20 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- pr_info("vm_end %lx < vm_start %lx\n",
+ printk("vm_end %lx < vm_start %lx\n",
vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_info("free gap %lx, correct %lx\n",
+ printk("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
@@ -391,7 +371,7 @@ static int browse_rb(struct rb_root *root)
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- pr_info("backwards %d, forwards %d\n", j, i);
+ printk("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
@@ -409,7 +389,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
}
}
-static void validate_mm(struct mm_struct *mm)
+void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
@@ -426,17 +406,17 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- pr_info("map_count %d vm_next %d\n", mm->map_count, i);
+ printk("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- pr_info("mm->highest_vm_end %lx, found %lx\n",
+ printk("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- pr_info("map_count %d rb %d\n", mm->map_count, i);
+ printk("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
@@ -554,34 +534,6 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
-{
- unsigned long nr_pages = 0;
- struct vm_area_struct *vma;
-
- /* Find first overlaping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
-
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
-
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
-
- if (vma->vm_start > end)
- break;
-
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
- }
-
- return nr_pages;
-}
-
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -615,7 +567,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ atomic_dec(&file->f_path.dentry->d_inode->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
@@ -643,10 +595,11 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct address_space *mapping = NULL;
- if (vma->vm_file) {
+ if (vma->vm_file)
mapping = vma->vm_file->f_mapping;
+
+ if (mapping)
mutex_lock(&mapping->i_mmap_mutex);
- }
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -684,9 +637,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
+ if (mm->mmap_cache == vma)
+ mm->mmap_cache = prev;
}
/*
@@ -848,7 +800,7 @@ again: remove_next = 1 + (end > next->vm_end);
anon_vma_interval_tree_post_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_post_update_vma(next);
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
}
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
@@ -898,15 +850,7 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- /*
- * VM_SOFTDIRTY should not prevent from VMA merging, if we
- * match the flags but dirty bit -- the caller should mark
- * merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressue on the memory system forcing
- * the kernel to generate new VMAs when old one could be
- * extended instead.
- */
- if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
+ if (vma->vm_flags ^ vm_flags)
return 0;
if (vma->vm_file != file)
return 0;
@@ -966,7 +910,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
if (is_mergeable_vma(vma, file, vm_flags) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
- vm_pglen = vma_pages(vma);
+ vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
return 1;
}
@@ -1095,7 +1039,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -1203,38 +1147,18 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-static inline int mlock_future_check(struct mm_struct *mm,
- unsigned long flags,
- unsigned long len)
-{
- unsigned long locked, lock_limit;
-
- /* mlock MCL_FUTURE? */
- if (flags & VM_LOCKED) {
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- return 0;
-}
-
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate)
+ unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
+ struct inode *inode;
vm_flags_t vm_flags;
- *populate = 0;
-
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
@@ -1282,12 +1206,20 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- if (mlock_future_check(mm, vm_flags, len))
- return -EAGAIN;
+ /* mlock MCL_FUTURE? */
+ if (vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
- if (file) {
- struct inode *inode = file_inode(file);
+ inode = file ? file->f_path.dentry->d_inode : NULL;
+ if (file) {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1303,7 +1235,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/*
* Make sure there are no mandatory locks on the file.
*/
- if (locks_verify_locked(file))
+ if (locks_verify_locked(inode))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
@@ -1320,10 +1252,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- return -EINVAL;
break;
default:
@@ -1332,8 +1262,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- return -EINVAL;
/*
* Ignore pgoff.
*/
@@ -1351,26 +1279,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
}
}
- /*
- * Set 'VM_NORESERVE' if we should not account for the
- * memory use of this mapping.
- */
- if (flags & MAP_NORESERVE) {
- /* We honor MAP_NORESERVE if allowed to overcommit */
- if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
- vm_flags |= VM_NORESERVE;
-
- /* hugetlb applies strict overcommit unless MAP_NORESERVE */
- if (file && is_file_hugepages(file))
- vm_flags |= VM_NORESERVE;
- }
-
- addr = mmap_region(file, addr, len, vm_flags, pgoff);
- if (!IS_ERR_VALUE(addr) &&
- ((vm_flags & VM_LOCKED) ||
- (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
- *populate = len;
- return addr;
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1382,30 +1291,20 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
+ if (unlikely(flags & MAP_HUGETLB))
+ return -EINVAL;
file = fget(fd);
if (!file)
goto out;
- if (is_file_hugepages(file))
- len = ALIGN(len, huge_page_size(hstate_file(file)));
- retval = -EINVAL;
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
- goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
- struct hstate *hs;
-
- hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
- if (!hs)
- return -EINVAL;
-
- len = ALIGN(len, huge_page_size(hs));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1416,7 +1315,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-out_fput:
if (file)
fput(file);
out:
@@ -1496,30 +1394,16 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
}
unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
+ unsigned long len, unsigned long flags,
+ vm_flags_t vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
+ int correct_wcount = 0;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- if (!(vm_flags & MAP_FIXED))
- return -ENOMEM;
-
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
+ struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
/* Clear old maps */
error = -ENOMEM;
@@ -1530,6 +1414,24 @@ munmap_back:
goto munmap_back;
}
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+ return -ENOMEM;
+
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if ((flags & MAP_NORESERVE)) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
+
/*
* Private writable mapping: check memory availability
*/
@@ -1566,11 +1468,16 @@ munmap_back:
vma->vm_pgoff = pgoff;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
+
if (file) {
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
+ correct_wcount = 1;
}
vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
@@ -1587,8 +1494,11 @@ munmap_back:
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
+ pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
+ if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
+ goto free_vma;
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
@@ -1610,39 +1520,29 @@ munmap_back:
}
vma_link(mm, vma, prev, rb_link, rb_parent);
- /* Once vma denies write, undo our temporary denial count */
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
file = vma->vm_file;
+
+ /* Once vma denies write, undo our temporary denial count */
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
out:
perf_event_mmap(vma);
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm)))
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
mm->locked_vm += (len >> PAGE_SHIFT);
- else
- vma->vm_flags &= ~VM_LOCKED;
- }
+ } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+ make_pages_present(addr, addr + len);
if (file)
uprobe_mmap(vma);
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
return addr;
unmap_and_free_vma:
- if (vm_flags & VM_DENYWRITE)
- allow_write_access(file);
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
vma->vm_file = NULL;
fput(file);
@@ -1877,7 +1777,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1886,20 +1786,29 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
- info.low_limit = mm->mmap_base;
+ info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
#endif
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
+{
+ /*
+ * Is this a new hole at the lowest possible address?
+ */
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
+ mm->free_area_cache = addr;
+}
+
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
@@ -1916,7 +1825,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1926,14 +1835,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -1956,6 +1865,19 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
}
#endif
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
+{
+ /*
+ * Is this a new hole at the highest possible address?
+ */
+ if (addr > mm->free_area_cache)
+ mm->free_area_cache = addr;
+
+ /* dont allow allocations above current base */
+ if (mm->free_area_cache > mm->mmap_base)
+ mm->free_area_cache = mm->mmap_base;
+}
+
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
@@ -1972,7 +1894,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
- if (file && file->f_op->get_unmapped_area)
+ if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
@@ -1993,33 +1915,37 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = NULL;
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
+ return NULL;
- while (rb_node) {
- struct vm_area_struct *tmp;
+ /* Check the cache first. */
+ /* (Cache hit rate is typically around 35%.) */
+ vma = mm->mmap_cache;
+ if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+ struct rb_node *rb_node;
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
+ while (rb_node) {
+ struct vm_area_struct *vma_tmp;
+
+ vma_tmp = rb_entry(rb_node,
+ struct vm_area_struct, vm_rb);
+
+ if (vma_tmp->vm_end > addr) {
+ vma = vma_tmp;
+ if (vma_tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
+ }
+ if (vma)
+ mm->mmap_cache = vma;
}
-
- if (vma)
- vmacache_update(addr, vma);
return vma;
}
@@ -2243,28 +2169,9 @@ int expand_downwards(struct vm_area_struct *vma,
return error;
}
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
#ifdef CONFIG_STACK_GROWSUP
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *next;
-
- address &= PAGE_MASK;
- next = vma->vm_next;
- if (next && next->vm_start == address + PAGE_SIZE) {
- if (!(next->vm_flags & VM_GROWSUP))
- return -ENOMEM;
- }
return expand_upwards(vma, address);
}
@@ -2279,21 +2186,14 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!prev || expand_stack(prev, addr))
return NULL;
- if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ if (prev->vm_flags & VM_LOCKED) {
+ mlock_vma_pages_range(prev, addr, prev->vm_end);
+ }
return prev;
}
#else
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
- struct vm_area_struct *prev;
-
- address &= PAGE_MASK;
- prev = vma->vm_prev;
- if (prev && prev->vm_end == address) {
- if (!(prev->vm_flags & VM_GROWSDOWN))
- return -ENOMEM;
- }
return expand_downwards(vma, address);
}
@@ -2314,8 +2214,9 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
- if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ if (vma->vm_flags & VM_LOCKED) {
+ mlock_vma_pages_range(vma, addr, start);
+ }
return vma;
}
#endif
@@ -2357,11 +2258,11 @@ static void unmap_region(struct mm_struct *mm,
struct mmu_gather tlb;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, 0);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : 0);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2375,6 +2276,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
+ unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
@@ -2391,9 +2293,12 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
+ if (mm->unmap_area == arch_unmap_area)
+ addr = prev ? prev->vm_end : mm->mmap_base;
+ else
+ addr = vma ? vma->vm_start : mm->mmap_base;
+ mm->unmap_area(mm, addr);
+ mm->mmap_cache = NULL; /* Kill the cache. */
}
/*
@@ -2403,6 +2308,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
+ struct mempolicy *pol;
struct vm_area_struct *new;
int err = -ENOMEM;
@@ -2426,9 +2332,12 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- err = vma_dup_policy(vma, new);
- if (err)
+ pol = mpol_dup(vma_policy(vma));
+ if (IS_ERR(pol)) {
+ err = PTR_ERR(pol);
goto out_free_vma;
+ }
+ vma_set_policy(new, pol);
if (anon_vma_clone(new, vma))
goto out_free_mpol;
@@ -2456,7 +2365,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
fput(new->vm_file);
unlink_anon_vmas(new);
out_free_mpol:
- mpol_put(vma_policy(new));
+ mpol_put(pol);
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
out_err:
@@ -2615,9 +2524,18 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (error & ~PAGE_MASK)
return error;
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ /*
+ * mlock MCL_FUTURE?
+ */
+ if (mm->def_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
/*
* mm->mmap_sem is required to protect against another thread
@@ -2671,9 +2589,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
- mm->locked_vm += (len >> PAGE_SHIFT);
- vma->vm_flags |= VM_SOFTDIRTY;
+ if (flags & VM_LOCKED) {
+ if (!mlock_vma_pages_range(vma, addr, addr + len))
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
return addr;
}
@@ -2681,14 +2600,10 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
unsigned long ret;
- bool populate;
down_write(&mm->mmap_sem);
ret = do_brk(addr, len);
- populate = ((mm->def_flags & VM_LOCKED) != 0);
up_write(&mm->mmap_sem);
- if (populate)
- mm_populate(addr, len);
return ret;
}
EXPORT_SYMBOL(vm_brk);
@@ -2720,12 +2635,12 @@ void exit_mmap(struct mm_struct *mm)
lru_add_drain();
flush_cache_mm(mm);
- tlb_gather_mmu(&tlb, mm, 0, -1);
+ tlb_gather_mmu(&tlb, mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(&tlb, 0, -1);
/*
@@ -2739,8 +2654,7 @@ void exit_mmap(struct mm_struct *mm)
}
vm_unacct_memory(nr_accounted);
- WARN_ON(atomic_long_read(&mm->nr_ptes) >
- (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
@@ -2792,6 +2706,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
+ struct mempolicy *pol;
bool faulted_in_anon_vma = true;
/*
@@ -2836,8 +2751,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
- if (vma_dup_policy(vma, new_vma))
+ pol = mpol_dup(vma_policy(vma));
+ if (IS_ERR(pol))
goto out_free_vma;
+ vma_set_policy(new_vma, pol);
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
@@ -2852,7 +2769,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_free_mempol:
- mpol_put(vma_policy(new_vma));
+ mpol_put(pol);
out_free_vma:
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
@@ -2874,31 +2791,6 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
return 1;
}
-static int special_mapping_fault(struct vm_area_struct *vma,
- struct vm_fault *vmf);
-
-/*
- * Having a close hook prevents vma merging regardless of flags.
- */
-static void special_mapping_close(struct vm_area_struct *vma)
-{
-}
-
-static const char *special_mapping_name(struct vm_area_struct *vma)
-{
- return ((struct vm_special_mapping *)vma->vm_private_data)->name;
-}
-
-static const struct vm_operations_struct special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
- .name = special_mapping_name,
-};
-
-static const struct vm_operations_struct legacy_special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
-};
static int special_mapping_fault(struct vm_area_struct *vma,
struct vm_fault *vmf)
@@ -2914,13 +2806,7 @@ static int special_mapping_fault(struct vm_area_struct *vma,
*/
pgoff = vmf->pgoff - vma->vm_pgoff;
- if (vma->vm_ops == &legacy_special_mapping_vmops)
- pages = vma->vm_private_data;
- else
- pages = ((struct vm_special_mapping *)vma->vm_private_data)->
- pages;
-
- for (; pgoff && *pages; ++pages)
+ for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
@@ -2933,29 +2819,48 @@ static int special_mapping_fault(struct vm_area_struct *vma,
return VM_FAULT_SIGBUS;
}
-static struct vm_area_struct *__install_special_mapping(
- struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_operations_struct *ops,
- void *priv)
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
{
int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- vma->vm_ops = ops;
- vma->vm_private_data = priv;
+ vma->vm_ops = &special_mapping_vmops;
+ vma->vm_private_data = pages;
ret = insert_vm_struct(mm, vma);
if (ret)
@@ -2965,40 +2870,11 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
- return vma;
+ return 0;
out:
kmem_cache_free(vm_area_cachep, vma);
- return ERR_PTR(ret);
-}
-
-/*
- * Called with mm->mmap_sem held for writing.
- * Insert a new vma covering the given region, with the given flags.
- * Its pages are supplied by the given array of struct page *.
- * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
- * The region past the last page supplied will always produce SIGBUS.
- * The array pointer and the pages it points to are assumed to stay alive
- * for as long as this mapping might exist.
- */
-struct vm_area_struct *_install_special_mapping(
- struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, const struct vm_special_mapping *spec)
-{
- return __install_special_mapping(mm, addr, len, vm_flags,
- &special_mapping_vmops, (void *)spec);
-}
-
-int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, struct page **pages)
-{
- struct vm_area_struct *vma = __install_special_mapping(
- mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
- (void *)pages);
-
- return PTR_ERR_OR_ZERO(vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
@@ -3067,7 +2943,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* vma in this mm is backed by the same anon_vma or address_space.
*
* We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
* takes more than one of them in a row. Secondly we're protected
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
*
@@ -3125,7 +3001,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
BUG();
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
}
}
@@ -3176,115 +3052,3 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}
-
-/*
- * Initialise sysctl_user_reserve_kbytes.
- *
- * This is intended to prevent a user from starting a single memory hogging
- * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
- * mode.
- *
- * The default value is min(3% of free memory, 128MB)
- * 128MB is enough to recover with sshd/login, bash, and top/kill.
- */
-static int init_user_reserve(void)
-{
- unsigned long free_kbytes;
-
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
-
- sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
- return 0;
-}
-subsys_initcall(init_user_reserve);
-
-/*
- * Initialise sysctl_admin_reserve_kbytes.
- *
- * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
- * to log in and kill a memory hogging process.
- *
- * Systems with more than 256MB will reserve 8MB, enough to recover
- * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
- * only reserve 3% of free pages by default.
- */
-static int init_admin_reserve(void)
-{
- unsigned long free_kbytes;
-
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
-
- sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
- return 0;
-}
-subsys_initcall(init_admin_reserve);
-
-/*
- * Reinititalise user and admin reserves if memory is added or removed.
- *
- * The default user reserve max is 128MB, and the default max for the
- * admin reserve is 8MB. These are usually, but not always, enough to
- * enable recovery from a memory hogging process using login/sshd, a shell,
- * and tools like top. It may make sense to increase or even disable the
- * reserve depending on the existence of swap or variations in the recovery
- * tools. So, the admin may have changed them.
- *
- * If memory is added and the reserves have been eliminated or increased above
- * the default max, then we'll trust the admin.
- *
- * If memory is removed and there isn't enough free memory, then we
- * need to reset the reserves.
- *
- * Otherwise keep the reserve set by the admin.
- */
-static int reserve_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- unsigned long tmp, free_kbytes;
-
- switch (action) {
- case MEM_ONLINE:
- /* Default max is 128MB. Leave alone if modified by operator. */
- tmp = sysctl_user_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 17))
- init_user_reserve();
-
- /* Default max is 8MB. Leave alone if modified by operator. */
- tmp = sysctl_admin_reserve_kbytes;
- if (0 < tmp && tmp < (1UL << 13))
- init_admin_reserve();
-
- break;
- case MEM_OFFLINE:
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
-
- if (sysctl_user_reserve_kbytes > free_kbytes) {
- init_user_reserve();
- pr_info("vm.user_reserve_kbytes reset to %lu\n",
- sysctl_user_reserve_kbytes);
- }
-
- if (sysctl_admin_reserve_kbytes > free_kbytes) {
- init_admin_reserve();
- pr_info("vm.admin_reserve_kbytes reset to %lu\n",
- sysctl_admin_reserve_kbytes);
- }
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block reserve_mem_nb = {
- .notifier_call = reserve_mem_notifier,
-};
-
-static int __meminit init_reserve_notifier(void)
-{
- if (register_hotmemory_notifier(&reserve_mem_nb))
- pr_err("Failed registering memory add/remove notifier for admin reserve\n");
-
- return 0;
-}
-subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index f802c2d216a..3dcfaf4ed35 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -14,6 +14,9 @@
* use_mm
* Makes the calling kernel thread take on the specified
* mm context.
+ * Called by the retry thread execute retries within the
+ * iocb issuer's mm context, so that copy_from/to_user
+ * operations work seamlessly for aio.
* (Note: this routine is intended to be called only
* from a kernel thread context)
*/
@@ -31,9 +34,6 @@ void use_mm(struct mm_struct *mm)
tsk->mm = mm;
switch_mm(active_mm, mm, tsk);
task_unlock(tsk);
-#ifdef finish_arch_post_lock_switch
- finish_arch_post_lock_switch();
-#endif
if (active_mm != mm)
mmdrop(active_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 41cefdf0aad..8a5ac8c686b 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,6 +37,7 @@ static struct srcu_struct srcu;
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int id;
/*
@@ -44,12 +45,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
* ->release returns.
*/
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
/*
- * If ->release runs before mmu_notifier_unregister it must be
- * handled, as it's the only way for the driver to flush all
- * existing sptes and stop the driver from establishing any more
- * sptes before all the pages in the mm are freed.
+ * if ->release runs before mmu_notifier_unregister it
+ * must be handled as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
@@ -62,22 +64,22 @@ void __mmu_notifier_release(struct mm_struct *mm)
hlist);
/*
* We arrived before mmu_notifier_unregister so
- * mmu_notifier_unregister will do nothing other than to wait
- * for ->release to finish and for mmu_notifier_unregister to
- * return.
+ * mmu_notifier_unregister will do nothing other than
+ * to wait ->release to finish and
+ * mmu_notifier_unregister to return.
*/
hlist_del_init_rcu(&mn->hlist);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * synchronize_srcu here prevents mmu_notifier_release from returning to
- * exit_mmap (which would proceed with freeing all pages in the mm)
- * until the ->release method returns, if it was invoked by
- * mmu_notifier_unregister.
+ * synchronize_srcu here prevents mmu_notifier_release to
+ * return to exit_mmap (which would proceed freeing all pages
+ * in the mm) until the ->release method returns, if it was
+ * invoked by mmu_notifier_unregister.
*
- * The mmu_notifier_mm can't go away from under us because one mm_count
- * is held by exit_mmap.
+ * The mmu_notifier_mm can't go away from under us because one
+ * mm_count is hold by exit_mmap.
*/
synchronize_srcu(&srcu);
}
@@ -91,10 +93,11 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long address)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int young = 0, id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
young |= mn->ops->clear_flush_young(mn, mm, address);
}
@@ -107,10 +110,11 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int young = 0, id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->test_young) {
young = mn->ops->test_young(mn, mm, address);
if (young)
@@ -126,10 +130,11 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
}
@@ -140,10 +145,11 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
@@ -154,31 +160,31 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
}
srcu_read_unlock(&srcu, id);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
srcu_read_unlock(&srcu, id);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
@@ -290,32 +296,29 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (!hlist_unhashed(&mn->hlist)) {
/*
- * SRCU here will force exit_mmap to wait for ->release to
- * finish before freeing the pages.
+ * SRCU here will force exit_mmap to wait ->release to finish
+ * before freeing the pages.
*/
int id;
id = srcu_read_lock(&srcu);
/*
- * exit_mmap will block in mmu_notifier_release to guarantee
- * that ->release is called before freeing the pages.
+ * exit_mmap will block in mmu_notifier_release to
+ * guarantee ->release is called before freeing the
+ * pages.
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
srcu_read_unlock(&srcu, id);
spin_lock(&mm->mmu_notifier_mm->lock);
- /*
- * Can not use list_del_rcu() since __mmu_notifier_release
- * can delete it before we hold the lock.
- */
- hlist_del_init_rcu(&mn->hlist);
+ hlist_del_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
}
/*
- * Wait for any running method to finish, of course including
- * ->release if it was run by mmu_notifier_release instead of us.
+ * Wait any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_relase instead of us.
*/
synchronize_srcu(&srcu);
@@ -329,4 +332,5 @@ static int __init mmu_notifier_init(void)
{
return init_srcu_struct(&srcu);
}
-subsys_initcall(mmu_notifier_init);
+
+module_init(mmu_notifier_init);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8556d..4596d81b89b 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,7 +1,7 @@
/*
* linux/mm/mmzone.c
*
- * management codes for pgdats, zones and page flags
+ * management codes for pgdats and zones.
*/
@@ -96,21 +96,3 @@ void lruvec_init(struct lruvec *lruvec)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}
-
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
-int page_cpupid_xchg_last(struct page *page, int cpupid)
-{
- unsigned long old_flags, flags;
- int last_cpupid;
-
- do {
- old_flags = flags = page->flags;
- last_cpupid = page_cpupid_last(page);
-
- flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
- flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
-
- return last_cpupid;
-}
-#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c43d557941f..94722a4d6b4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,6 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
-#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -36,47 +35,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
-/*
- * For a prot_numa update we only hold mmap_sem for read so there is a
- * potential race with faulting where a pmd was temporarily none. This
- * function checks for a transhuge pmd under the appropriate lock. It
- * returns a pte if it was successfully locked or NULL if it raced with
- * a transhuge insertion.
- */
-static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, int prot_numa, spinlock_t **ptl)
-{
- pte_t *pte;
- spinlock_t *pmdl;
-
- /* !prot_numa is protected by mmap_sem held for write */
- if (!prot_numa)
- return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
-
- pmdl = pmd_lock(vma->vm_mm, pmd);
- if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
- spin_unlock(pmdl);
- return NULL;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
- spin_unlock(pmdl);
- return pte;
-}
-
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
- pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
- if (!pte)
- return 0;
-
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -84,111 +54,117 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t ptent;
bool updated = false;
+ ptent = ptep_modify_prot_start(mm, addr, pte);
if (!prot_numa) {
- ptent = ptep_modify_prot_start(mm, addr, pte);
- if (pte_numa(ptent))
- ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
- /*
- * Avoid taking write faults for pages we
- * know to be dirty.
- */
- if (dirty_accountable && pte_dirty(ptent))
- ptent = pte_mkwrite(ptent);
- ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
- if (page && !PageKsm(page)) {
- if (!pte_numa(oldpte)) {
- ptep_set_numa(mm, addr, pte);
+ if (page) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
+ ptent = pte_mknuma(ptent);
updated = true;
}
}
}
+
+ /*
+ * Avoid taking write faults for pages we know to be
+ * dirty.
+ */
+ if (dirty_accountable && pte_dirty(ptent)) {
+ ptent = pte_mkwrite(ptent);
+ updated = true;
+ }
+
if (updated)
pages++;
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
- pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
- newpte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(oldpte))
- newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(mm, addr, pte, newpte);
-
- pages++;
+ set_pte_at(mm, addr, pte,
+ swp_entry_to_pte(entry));
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+ *ret_all_same_node = all_same_node;
return pages;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
- struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
- unsigned long nr_huge_updates = 0;
- unsigned long mni_start = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
-
next = pmd_addr_end(addr, end);
- if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
- continue;
-
- /* invoke the mmu notifier if the pmd is populated */
- if (!mni_start) {
- mni_start = addr;
- mmu_notifier_invalidate_range_start(mm, mni_start, end);
- }
-
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
- else {
- int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, prot_numa);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
- pages += HPAGE_PMD_NR;
- nr_huge_updates++;
- }
-
- /* huge pmd was handled */
- continue;
- }
+ else if (change_huge_pmd(vma, pmd, addr, newprot,
+ prot_numa)) {
+ pages += HPAGE_PMD_NR;
+ continue;
}
- /* fall through, the trans huge pmd just split */
+ /* fall through */
}
- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- dirty_accountable, prot_numa);
- pages += this_pages;
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
- if (mni_start)
- mmu_notifier_invalidate_range_end(mm, mni_start, end);
-
- if (nr_huge_updates)
- count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}
@@ -225,7 +201,6 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -237,7 +212,6 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
return pages;
}
@@ -246,12 +220,15 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
+ struct mm_struct *mm = vma->vm_mm;
unsigned long pages;
+ mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return pages;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f2..e1031e1f6a6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -15,12 +15,10 @@
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
-#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -70,23 +68,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return pmd;
}
-static pte_t move_soft_dirty_pte(pte_t pte)
-{
- /*
- * Set soft dirty bit so we can notice
- * in userspace the ptes were moved.
- */
-#ifdef CONFIG_MEM_SOFT_DIRTY
- if (pte_present(pte))
- pte = pte_mksoft_dirty(pte);
- else if (is_swap_pte(pte))
- pte = pte_swp_mksoft_dirty(pte);
- else if (pte_file(pte))
- pte = pte_file_mksoft_dirty(pte);
-#endif
- return pte;
-}
-
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
@@ -144,7 +125,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
continue;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
- pte = move_soft_dirty_pte(pte);
set_pte_at(mm, new_addr, new_pte, pte);
}
@@ -154,7 +134,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (anon_vma)
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -194,17 +174,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
- if (extent == HPAGE_PMD_SIZE) {
- VM_BUG_ON(vma->vm_file || !vma->anon_vma);
- /* See comment in move_ptes() */
- if (need_rmap_locks)
- anon_vma_lock_write(vma->anon_vma);
+ if (extent == HPAGE_PMD_SIZE)
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
- if (need_rmap_locks)
- anon_vma_unlock_write(vma->anon_vma);
- }
if (err > 0) {
need_flush = true;
continue;
@@ -235,7 +208,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr, bool *locked)
+ unsigned long new_len, unsigned long new_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -326,7 +299,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
+ if (new_len > old_len)
+ mlock_vma_pages_range(new_vma, new_addr + old_len,
+ new_addr + new_len);
}
return new_addr;
@@ -391,8 +366,9 @@ Eagain:
return ERR_PTR(-EAGAIN);
}
-static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
- unsigned long new_addr, unsigned long new_len, bool *locked)
+static unsigned long mremap_to(unsigned long addr,
+ unsigned long old_len, unsigned long new_addr,
+ unsigned long new_len)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -442,7 +418,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (ret & ~PAGE_MASK)
goto out1;
- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr);
if (!(ret & ~PAGE_MASK))
goto out;
out1:
@@ -480,16 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
- bool locked = false;
- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
- return ret;
+ down_write(&current->mm->mmap_sem);
- if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
- return ret;
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ goto out;
if (addr & ~PAGE_MASK)
- return ret;
+ goto out;
old_len = PAGE_ALIGN(old_len);
new_len = PAGE_ALIGN(new_len);
@@ -500,13 +474,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* a zero new-len is nonsensical.
*/
if (!new_len)
- return ret;
-
- down_write(&current->mm->mmap_sem);
+ goto out;
if (flags & MREMAP_FIXED) {
- ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ if (flags & MREMAP_MAYMOVE)
+ ret = mremap_to(addr, old_len, new_addr, new_len);
goto out;
}
@@ -548,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
- locked = true;
- new_addr = addr;
+ mlock_vma_pages_range(vma, addr + old_len,
+ addr + new_len);
}
ret = addr;
goto out;
@@ -575,13 +547,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+ ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
up_write(&current->mm->mmap_sem);
- if (locked && new_len > old_len)
- mm_populate(new_addr + old_len, new_len - old_len);
return ret;
}
diff --git a/mm/msync.c b/mm/msync.c
index 992a1673d48..632df4527c0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -58,7 +58,6 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
vma = find_vma(mm, start);
for (;;) {
struct file *file;
- loff_t fstart, fend;
/* Still start < end. */
error = -ENOMEM;
@@ -78,18 +77,12 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
}
file = vma->vm_file;
- fstart = (start - vma->vm_start) +
- ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- fend = fstart + (min(end, vma->vm_end) - start) - 1;
start = vma->vm_end;
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- if (vma->vm_flags & VM_NONLINEAR)
- error = vfs_fsync(file, 1);
- else
- error = vfs_fsync_range(file, fstart, fend, 1);
+ error = vfs_fsync(file, 0);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7ed58602e71..b8294fc03df 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,15 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+ addr = memblock_find_in_range_node(goal, limit, size, align, nid);
if (!addr)
return NULL;
- if (memblock_reserve(addr, size))
- return NULL;
-
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
+ memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
* are never reported as leaks.
@@ -84,18 +82,27 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
- int order;
+ unsigned long i, start_aligned, end_aligned;
+ int order = ilog2(BITS_PER_LONG);
- while (start < end) {
- order = min(MAX_ORDER - 1UL, __ffs(start));
+ start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+ end_aligned = end & ~(BITS_PER_LONG - 1);
- while (start + (1UL << order) > end)
- order--;
+ if (end_aligned <= start_aligned) {
+ for (i = start; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
- __free_pages_bootmem(pfn_to_page(start), order);
-
- start += (1UL << order);
+ return;
}
+
+ for (i = start; i < start_aligned; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+ __free_pages_bootmem(pfn_to_page(i), order);
+
+ for (i = end_aligned; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
}
static unsigned long __init __free_memory_core(phys_addr_t start,
@@ -113,53 +120,52 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
-static unsigned long __init free_low_memory_core_early(void)
+unsigned long __init free_low_memory_core_early(int nodeid)
{
unsigned long count = 0;
- phys_addr_t start, end;
+ phys_addr_t start, end, size;
u64 i;
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
count += __free_memory_core(start, end);
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
- {
- phys_addr_t size;
-
- /* Free memblock.reserved array if it was allocated */
- size = get_allocated_memblock_reserved_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
-
- /* Free memblock.memory array if it was allocated */
- size = get_allocated_memblock_memory_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
- }
-#endif
+ /* free range that is used for reserved array if we allocate it */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
return count;
}
-static int reset_managed_pages_done __initdata;
-
-static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- if (reset_managed_pages_done)
- return;
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->managed_pages = 0;
+ if (!is_highmem(z))
+ z->managed_pages = 0;
}
-void __init reset_all_zones_managed_pages(void)
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
- struct pglist_data *pgdat;
+ register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
- for_each_online_pgdat(pgdat)
- reset_node_managed_pages(pgdat);
- reset_managed_pages_done = 1;
+ /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
}
/**
@@ -169,19 +175,17 @@ void __init reset_all_zones_managed_pages(void)
*/
unsigned long __init free_all_bootmem(void)
{
- unsigned long pages;
+ struct pglist_data *pgdat;
- reset_all_zones_managed_pages();
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
/*
- * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
+ * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
- pages = free_low_memory_core_early();
- totalram_pages += pages;
-
- return pages;
+ return free_low_memory_core_early(MAX_NUMNODES);
}
/**
@@ -197,6 +201,7 @@ unsigned long __init free_all_bootmem(void)
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
+ kmemleak_free_part(__va(physaddr), size);
memblock_free(physaddr, size);
}
@@ -211,6 +216,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
+ kmemleak_free_part(__va(addr), size);
memblock_free(addr, size);
}
@@ -226,7 +232,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
restart:
- ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
if (ptr)
return ptr;
@@ -310,7 +316,7 @@ again:
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
goal, limit);
if (ptr)
return ptr;
@@ -332,7 +338,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
}
-static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal,
unsigned long limit)
{
@@ -400,14 +406,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
}
-void * __init __alloc_bootmem_low_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem_nopanic(size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
-}
-
/**
* __alloc_bootmem_low_node - allocate low boot memory from a specific node
* @pgdat: node to allocate from
diff --git a/mm/nommu.c b/mm/nommu.c
index 4a852f6c570..79c3cac87af 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,11 +13,8 @@
* Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/export.h>
#include <linux/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -27,14 +24,11 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
-#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
-#include <linux/sched/sysctl.h>
-#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -61,15 +55,13 @@
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
+unsigned long num_physpages;
unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
@@ -90,6 +82,7 @@ unsigned long vm_memory_committed(void)
EXPORT_SYMBOL_GPL(vm_memory_committed);
EXPORT_SYMBOL(mem_map);
+EXPORT_SYMBOL(num_physpages);
/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
@@ -146,10 +139,10 @@ unsigned int kobjsize(const void *objp)
return PAGE_SIZE << compound_order(page);
}
-long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int foll_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, unsigned int foll_flags,
+ struct page **pages, struct vm_area_struct **vmas,
+ int *retry)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -196,10 +189,9 @@ finish_or_fault:
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- int write, int force, struct page **pages,
- struct vm_area_struct **vmas)
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, int write, int force,
+ struct page **pages, struct vm_area_struct **vmas)
{
int flags = 0;
@@ -234,7 +226,8 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL(follow_pfn);
-LIST_HEAD(vmap_area_list);
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
void vfree(const void *addr)
{
@@ -286,10 +279,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
long vread(char *buf, char *addr, unsigned long count)
{
- /* Don't allow overflow */
- if ((unsigned long) buf + count < count)
- count = -(unsigned long) buf;
-
memcpy(buf, addr, count);
return count;
}
@@ -301,7 +290,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
- return count;
+ return(count);
}
/*
@@ -464,7 +453,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __weak vmalloc_sync_all(void)
+void __attribute__((weak)) vmalloc_sync_all(void)
{
}
@@ -773,23 +762,16 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
- int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
- }
+ if (mm->mmap_cache == vma)
+ mm->mmap_cache = NULL;
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -837,8 +819,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
+ vma = mm->mmap_cache;
+ if (vma && vma->vm_start <= addr && vma->vm_end > addr)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -847,7 +829,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
+ mm->mmap_cache = vma;
return vma;
}
}
@@ -886,8 +868,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
+ vma = mm->mmap_cache;
+ if (vma && vma->vm_start == addr && vma->vm_end == end)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -898,7 +880,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- vmacache_update(addr, vma);
+ mm->mmap_cache = vma;
return vma;
}
}
@@ -950,7 +932,7 @@ static int validate_mmap_request(struct file *file,
struct address_space *mapping;
/* files must support mmap */
- if (!file->f_op->mmap)
+ if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -959,7 +941,7 @@ static int validate_mmap_request(struct file *file,
*/
mapping = file->f_mapping;
if (!mapping)
- mapping = file_inode(file)->i_mapping;
+ mapping = file->f_path.dentry->d_inode->i_mapping;
capabilities = 0;
if (mapping && mapping->backing_dev_info)
@@ -968,7 +950,7 @@ static int validate_mmap_request(struct file *file,
if (!capabilities) {
/* no explicit capabilities set, so assume some
* defaults */
- switch (file_inode(file)->i_mode & S_IFMT) {
+ switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
capabilities = BDI_CAP_MAP_COPY;
@@ -1003,11 +985,11 @@ static int validate_mmap_request(struct file *file,
!(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (IS_APPEND(file_inode(file)) &&
+ if (IS_APPEND(file->f_path.dentry->d_inode) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file))
+ if (locks_verify_locked(file->f_path.dentry->d_inode))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -1015,7 +997,8 @@ static int validate_mmap_request(struct file *file,
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
- } else {
+ }
+ else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -1046,20 +1029,23 @@ static int validate_mmap_request(struct file *file,
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
- } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ }
+ else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
- } else if ((prot & PROT_READ) &&
+ }
+ else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
- } else {
+ }
+ else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
@@ -1249,7 +1235,7 @@ error_free:
return ret;
enomem:
- pr_err("Allocation of length %lu from process %d (%s) failed\n",
+ printk("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
show_free_areas(0);
return -ENOMEM;
@@ -1263,8 +1249,7 @@ unsigned long do_mmap_pgoff(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
- unsigned long pgoff,
- unsigned long *populate)
+ unsigned long pgoff)
{
struct vm_area_struct *vma;
struct vm_region *region;
@@ -1274,8 +1259,6 @@ unsigned long do_mmap_pgoff(struct file *file,
kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
- *populate = 0;
-
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1339,8 +1322,8 @@ unsigned long do_mmap_pgoff(struct file *file,
continue;
/* search for overlapping mappings on the same file */
- if (file_inode(pregion->vm_file) !=
- file_inode(file))
+ if (pregion->vm_file->f_path.dentry->d_inode !=
+ file->f_path.dentry->d_inode)
continue;
if (pregion->vm_pgoff >= pgend)
@@ -1667,7 +1650,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
- static int limit;
+ static int limit = 0;
if (limit < 5) {
printk(KERN_WARNING
"munmap of memory not mmapped by process %d"
@@ -1782,7 +1765,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
*
* MREMAP_FIXED is not supported under NOMMU conditions
*/
-static unsigned long do_mremap(unsigned long addr,
+unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
@@ -1817,6 +1800,7 @@ static unsigned long do_mremap(unsigned long addr,
vma->vm_end = vma->vm_start + new_len;
return vma->vm_start;
}
+EXPORT_SYMBOL(do_mremap);
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long, new_len, unsigned long, flags,
@@ -1830,11 +1814,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return ret;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
{
- *page_mask = 0;
return NULL;
}
@@ -1849,16 +1831,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(remap_pfn_range);
-int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
-{
- unsigned long pfn = start >> PAGE_SHIFT;
- unsigned long vm_len = vma->vm_end - vma->vm_start;
-
- pfn += vma->vm_pgoff;
- return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
-}
-EXPORT_SYMBOL(vm_iomap_memory);
-
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
@@ -1880,6 +1852,10 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
+{
+}
+
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
@@ -1905,7 +1881,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ unsigned long free, allowed;
vm_acct_memory(pages);
@@ -1927,7 +1903,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
free -= global_page_state(NR_SHMEM);
- free += get_nr_swap_pages();
+ free += nr_swap_pages;
/*
* Any slabs which are created with the
@@ -1946,10 +1922,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Reserve some for root
+ * Leave the last 3% for root
*/
if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ free -= free / 32;
if (free > pages)
return 0;
@@ -1957,20 +1933,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = vm_commit_limit();
+ allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/*
- * Reserve some 3% for root
+ * Leave the last 3% for root
*/
if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= allowed / 32;
+ allowed += total_swap_pages;
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
- }
+ /* Don't let a single process grow too big:
+ leave 3% of the size of this process for other processes */
+ if (mm)
+ allowed -= mm->total_vm / 32;
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -1993,12 +1967,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- BUG();
-}
-EXPORT_SYMBOL(filemap_map_pages);
-
int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long size, pgoff_t pgoff)
{
@@ -2138,45 +2106,3 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
up_write(&nommu_region_sem);
return 0;
}
-
-/*
- * Initialise sysctl_user_reserve_kbytes.
- *
- * This is intended to prevent a user from starting a single memory hogging
- * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
- * mode.
- *
- * The default value is min(3% of free memory, 128MB)
- * 128MB is enough to recover with sshd/login, bash, and top/kill.
- */
-static int __meminit init_user_reserve(void)
-{
- unsigned long free_kbytes;
-
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
-
- sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
- return 0;
-}
-module_init(init_user_reserve)
-
-/*
- * Initialise sysctl_admin_reserve_kbytes.
- *
- * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
- * to log in and kill a memory hogging process.
- *
- * Systems with more than 256MB will reserve 8MB, enough to recover
- * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
- * only reserve 3% of free pages by default.
- */
-static int __meminit init_admin_reserve(void)
-{
- unsigned long free_kbytes;
-
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
-
- sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
- return 0;
-}
-module_init(init_admin_reserve)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3291e82d435..0399f146ae4 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,21 +47,19 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @start: task struct of which task to consider
+ * @tsk: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *start,
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
const nodemask_t *mask)
{
- struct task_struct *tsk;
- bool ret = false;
+ struct task_struct *start = tsk;
- rcu_read_lock();
- for_each_thread(start, tsk) {
+ do {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -69,20 +67,19 @@ static bool has_intersects_mems_allowed(struct task_struct *start,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- ret = mempolicy_nodemask_intersects(tsk, mask);
+ if (mempolicy_nodemask_intersects(tsk, mask))
+ return true;
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- ret = cpuset_mems_allowed_intersects(current, tsk);
+ if (cpuset_mems_allowed_intersects(current, tsk))
+ return true;
}
- if (ret)
- break;
- }
- rcu_read_unlock();
+ } while_each_thread(start, tsk);
- return ret;
+ return false;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -100,21 +97,16 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t;
-
- rcu_read_lock();
+ struct task_struct *t = p;
- for_each_thread(p, t) {
+ do {
task_lock(t);
if (likely(t->mm))
- goto found;
+ return t;
task_unlock(t);
- }
- t = NULL;
-found:
- rcu_read_unlock();
+ } while_each_thread(p, t);
- return t;
+ return NULL;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -169,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes +
get_mm_counter(p->mm, MM_SWAPENTS);
task_unlock(p);
@@ -178,7 +170,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= (points * 3) / 100;
+ adj -= 30;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
@@ -296,7 +288,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
- * number of 'points'. Returns -1 on scan abort.
+ * number of 'points'.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
@@ -309,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- for_each_process_thread(g, p) {
+ do_each_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -322,20 +314,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
continue;
case OOM_SCAN_ABORT:
rcu_read_unlock();
- return (struct task_struct *)(-1UL);
+ return ERR_PTR(-1UL);
case OOM_SCAN_OK:
break;
};
points = oom_badness(p, NULL, nodemask, totalpages);
- if (!points || points < chosen_points)
- continue;
- /* Prefer thread group leaders for display purposes */
- if (points == chosen_points && thread_group_leader(chosen))
- continue;
-
- chosen = p;
- chosen_points = points;
- }
+ if (points > chosen_points) {
+ chosen = p;
+ chosen_points = points;
+ }
+ } while_each_thread(g, p);
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -376,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- atomic_long_read(&task->mm->nr_ptes),
+ task->mm->nr_ptes,
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -398,10 +386,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
- if (memcg)
- mem_cgroup_print_oom_info(memcg, p);
- else
- show_mem(SHOW_MEM_FILTER_NODES);
+ mem_cgroup_print_oom_info(memcg, p);
+ show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
dump_tasks(memcg, nodemask);
}
@@ -418,7 +404,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *t = p;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -449,7 +435,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- for_each_thread(p, t) {
+ do {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -467,11 +453,13 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- }
+ } while_each_thread(p, t);
read_unlock(&tasklist_lock);
+ rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
+ rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -497,7 +485,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
- rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
@@ -668,7 +655,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
+ if (PTR_ERR(p) != -1UL) {
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory");
killed = 1;
@@ -689,12 +676,9 @@ out:
*/
void pagefault_out_of_memory(void)
{
- struct zonelist *zonelist;
-
- if (mem_cgroup_oom_synchronize(true))
- return;
+ struct zonelist *zonelist = node_zonelist(first_online_node,
+ GFP_KERNEL);
- zonelist = node_zonelist(first_online_node, GFP_KERNEL);
if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
clear_zonelist_oom(zonelist, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 518e2c3f4c7..0713bfbf095 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,12 +35,8 @@
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
#include <linux/timer.h>
-#include <linux/sched/rt.h>
-#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
-#include "internal.h"
-
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
*/
@@ -156,6 +152,24 @@ static unsigned long writeout_period_time = 0;
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around. To avoid stressing page reclaim with lots of unreclaimable
+ * pages. It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
* free and reclaimable pages, minus some zone reserves to protect
@@ -173,26 +187,6 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
-/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- unsigned long nr_pages;
-
- nr_pages = zone_page_state(zone, NR_FREE_PAGES);
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
-
- nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
- nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
-
- return nr_pages;
-}
-
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -200,9 +194,11 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_dirtyable_memory(z);
+ x += zone_page_state(z, NR_FREE_PAGES) +
+ zone_reclaimable_pages(z) - z->dirty_balance_reserve;
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -238,12 +234,9 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES);
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
x -= min(x, dirty_balance_reserve);
- x += global_page_state(NR_INACTIVE_FILE);
- x += global_page_state(NR_ACTIVE_FILE);
-
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -292,6 +285,32 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ /*
+ * The effective global number of dirtyable pages may exclude
+ * highmem as a big-picture measure to keep the ratio between
+ * dirty memory and lowmem reasonable.
+ *
+ * But this function is purely about the individual zone and a
+ * highmem zone can hold its share of dirty pages, so we don't
+ * care about vm_highmem_is_dirtyable here.
+ */
+ unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone);
+
+ /* don't allow this to underflow */
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ return nr_pages;
+}
+
+/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
@@ -562,37 +581,6 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
}
/*
- * setpoint - dirty 3
- * f(dirty) := 1.0 + (----------------)
- * limit - setpoint
- *
- * it's a 3rd order polynomial that subjects to
- *
- * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
- * (2) f(setpoint) = 1.0 => the balance point
- * (3) f(limit) = 0 => the hard limit
- * (4) df/dx <= 0 => negative feedback control
- * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
- * => fast response on large errors; small oscillation near setpoint
- */
-static long long pos_ratio_polynom(unsigned long setpoint,
- unsigned long dirty,
- unsigned long limit)
-{
- long long pos_ratio;
- long x;
-
- x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
- limit - setpoint + 1);
- pos_ratio = x;
- pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
- pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
- pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
-
- return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
-}
-
-/*
* Dirty position control.
*
* (o) global/bdi setpoints
@@ -690,80 +678,26 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
/*
* global setpoint
*
- * See comment for pos_ratio_polynom().
- */
- setpoint = (freerun + limit) / 2;
- pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
-
- /*
- * The strictlimit feature is a tool preventing mistrusted filesystems
- * from growing a large number of dirty pages before throttling. For
- * such filesystems balance_dirty_pages always checks bdi counters
- * against bdi limits. Even if global "nr_dirty" is under "freerun".
- * This is especially important for fuse which sets bdi->max_ratio to
- * 1% by default. Without strictlimit feature, fuse writeback may
- * consume arbitrary amount of RAM because it is accounted in
- * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ * setpoint - dirty 3
+ * f(dirty) := 1.0 + (----------------)
+ * limit - setpoint
*
- * Here, in bdi_position_ratio(), we calculate pos_ratio based on
- * two values: bdi_dirty and bdi_thresh. Let's consider an example:
- * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
- * limits are set by default to 10% and 20% (background and throttle).
- * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
- * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
- * about ~6K pages (as the average of background and throttle bdi
- * limits). The 3rd order polynomial will provide positive feedback if
- * bdi_dirty is under bdi_setpoint and vice versa.
+ * it's a 3rd order polynomial that subjects to
*
- * Note, that we cannot use global counters in these calculations
- * because we want to throttle process writing to a strictlimit BDI
- * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
- * in the example above).
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
*/
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- long long bdi_pos_ratio;
- unsigned long bdi_bg_thresh;
-
- if (bdi_dirty < 8)
- return min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
-
- if (bdi_dirty >= bdi_thresh)
- return 0;
-
- bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
- bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
- bdi_bg_thresh);
-
- if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
- return 0;
-
- bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
- bdi_thresh);
-
- /*
- * Typically, for strictlimit case, bdi_setpoint << setpoint
- * and pos_ratio >> bdi_pos_ratio. In the other words global
- * state ("dirty") is not limiting factor and we have to
- * make decision based on bdi counters. But there is an
- * important case when global pos_ratio should get precedence:
- * global limits are exceeded (e.g. due to activities on other
- * BDIs) while given strictlimit BDI is below limit.
- *
- * "pos_ratio * bdi_pos_ratio" would work for the case above,
- * but it would look too non-natural for the case of all
- * activity in the system coming from a single strictlimit BDI
- * with bdi->max_ratio == 100%.
- *
- * Note that min() below somewhat changes the dynamics of the
- * control system. Normally, pos_ratio value can be well over 3
- * (when globally we are at freerun and bdi is well below bdi
- * setpoint). Now the maximum pos_ratio in the same situation
- * is 2. We might want to tweak this if we observe the control
- * system is too slow to adapt.
- */
- return min(pos_ratio, bdi_pos_ratio);
- }
+ setpoint = (freerun + limit) / 2;
+ x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+ limit - setpoint + 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
/*
* We have computed basic pos_ratio above based on global situation. If
@@ -824,7 +758,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
x_intercept = bdi_setpoint + span;
if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
+ pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
x_intercept - bdi_setpoint + 1);
} else
pos_ratio /= 4;
@@ -1056,27 +990,6 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* keep that period small to reduce time lags).
*/
step = 0;
-
- /*
- * For strictlimit case, calculations above were based on bdi counters
- * and limits (starting from pos_ratio = bdi_position_ratio() and up to
- * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
- * Hence, to calculate "step" properly, we have to use bdi_dirty as
- * "dirty" and bdi_setpoint as "setpoint".
- *
- * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
- * it's possible that bdi_thresh is close to zero due to inactivity
- * of backing device (see the implementation of bdi_dirty_limit()).
- */
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- dirty = bdi_dirty;
- if (bdi_dirty < 8)
- setpoint = bdi_dirty + 1;
- else
- setpoint = (bdi_thresh +
- bdi_dirty_limit(bdi, bg_thresh)) / 2;
- }
-
if (dirty < setpoint) {
x = min(bdi->balanced_dirty_ratelimit,
min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1187,11 +1100,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
{
- unsigned long bw = bdi->avg_write_bandwidth;
- unsigned long t;
+ long bw = bdi->avg_write_bandwidth;
+ long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
@@ -1203,7 +1116,7 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
- return min_t(unsigned long, t, MAX_PAUSE);
+ return min_t(long, t, MAX_PAUSE);
}
static long bdi_min_pause(struct backing_dev_info *bdi,
@@ -1281,56 +1194,6 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
- unsigned long dirty_thresh,
- unsigned long background_thresh,
- unsigned long *bdi_dirty,
- unsigned long *bdi_thresh,
- unsigned long *bdi_bg_thresh)
-{
- unsigned long bdi_reclaimable;
-
- /*
- * bdi_thresh is not treated as some limiting factor as
- * dirty_thresh, due to reasons
- * - in JBOD setup, bdi_thresh can fluctuate a lot
- * - in a system with HDD and USB key, the USB key may somehow
- * go into state (bdi_dirty >> bdi_thresh) either because
- * bdi_dirty starts high, or because bdi_thresh drops low.
- * In this case we don't want to hard throttle the USB key
- * dirtiers for 100 seconds until bdi_dirty drops under
- * bdi_thresh. Instead the auxiliary bdi control line in
- * bdi_position_ratio() will let the dirtier task progress
- * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
- */
- *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
- if (bdi_bg_thresh)
- *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
- background_thresh,
- dirty_thresh);
-
- /*
- * In order to avoid the stacked BDI deadlock we need
- * to ensure we accurately count the 'dirty' pages when
- * the threshold is low.
- *
- * Otherwise it would be possible to get thresh+n pages
- * reported dirty, even though there are thresh-m pages
- * actually dirty; with m+n sitting in the percpu
- * deltas.
- */
- if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else {
- bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat(bdi, BDI_WRITEBACK);
- }
-}
-
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -1342,9 +1205,13 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long pages_dirtied)
{
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long bdi_reclaimable;
unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
+ unsigned long bdi_dirty;
+ unsigned long freerun;
unsigned long background_thresh;
unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
long period;
long pause;
long max_pause;
@@ -1355,16 +1222,10 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
struct backing_dev_info *bdi = mapping->backing_dev_info;
- bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
- unsigned long uninitialized_var(bdi_thresh);
- unsigned long thresh;
- unsigned long uninitialized_var(bdi_dirty);
- unsigned long dirty;
- unsigned long bg_thresh;
/*
* Unstable writes are a feature of certain networked
@@ -1378,44 +1239,61 @@ static void balance_dirty_pages(struct address_space *mapping,
global_dirty_limits(&background_thresh, &dirty_thresh);
- if (unlikely(strictlimit)) {
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, &bg_thresh);
-
- dirty = bdi_dirty;
- thresh = bdi_thresh;
- } else {
- dirty = nr_dirty;
- thresh = dirty_thresh;
- bg_thresh = background_thresh;
- }
-
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
- * when the bdi limits are ramping up in case of !strictlimit.
- *
- * In strictlimit case make decision based on the bdi counters
- * and limits. Small writeouts when the bdi limits are ramping
- * up are the price we consciously pay for strictlimit-ing.
+ * when the bdi limits are ramping up.
*/
- if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+ freerun = dirty_freerun_ceiling(dirty_thresh,
+ background_thresh);
+ if (nr_dirty <= freerun) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
current->nr_dirtied_pause =
- dirty_poll_interval(dirty, thresh);
+ dirty_poll_interval(nr_dirty, dirty_thresh);
break;
}
if (unlikely(!writeback_in_progress(bdi)))
bdi_start_background_writeback(bdi);
- if (!strictlimit)
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, NULL);
+ /*
+ * bdi_thresh is not treated as some limiting factor as
+ * dirty_thresh, due to reasons
+ * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (bdi_dirty >> bdi_thresh) either because
+ * bdi_dirty starts high, or because bdi_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until bdi_dirty drops under
+ * bdi_thresh. Instead the auxiliary bdi control line in
+ * bdi_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ */
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
+ bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else {
+ bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ bdi_dirty = bdi_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
+ }
dirty_exceeded = (bdi_dirty > bdi_thresh) &&
- ((nr_dirty > dirty_thresh) || strictlimit);
+ (nr_dirty > dirty_thresh);
if (dirty_exceeded && !bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
@@ -1544,9 +1422,9 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page)
+void set_page_dirty_balance(struct page *page, int page_mkwrite)
{
- if (set_page_dirty(page)) {
+ if (set_page_dirty(page) || page_mkwrite) {
struct address_space *mapping = page_mapping(page);
if (mapping)
@@ -1605,7 +1483,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* 1000+ tasks, all of them start dirtying pages at exactly the same
* time, hence all honoured too large initial task->nr_dirtied_pause.
*/
- p = this_cpu_ptr(&bdp_ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
else if (unlikely(*p >= ratelimit_pages)) {
@@ -1617,7 +1495,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelock other long-run dirtiers.
*/
- p = this_cpu_ptr(&dirty_throttle_leaks);
+ p = &__get_cpu_var(dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
@@ -1664,7 +1542,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+int dirty_writeback_centisecs_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
@@ -1737,7 +1615,7 @@ void writeback_set_ratelimit(void)
ratelimit_pages = 16;
}
-static int
+static int __cpuinit
ratelimit_handler(struct notifier_block *self, unsigned long action,
void *hcpu)
{
@@ -1752,7 +1630,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long action,
}
}
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = {
.notifier_call = ratelimit_handler,
.next = NULL,
};
@@ -2104,8 +1982,6 @@ int __set_page_dirty_no_writeback(struct page *page)
*/
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
- trace_writeback_dirty_page(page, mapping);
-
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
@@ -2120,17 +1996,11 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for set_page_writeback family.
- *
- * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
- * while calling this function.
- * See test_set_page_writeback for example.
- *
* NOTE: Unlike account_page_dirtied this does not rely on being atomic
* wrt interrupts.
*/
void account_page_writeback(struct page *page)
{
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
EXPORT_SYMBOL(account_page_writeback);
@@ -2155,12 +2025,11 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
- unsigned long flags;
if (!mapping)
return 1;
- spin_lock_irqsave(&mapping->tree_lock, flags);
+ spin_lock_irq(&mapping->tree_lock);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
@@ -2169,7 +2038,7 @@ int __set_page_dirty_nobuffers(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ spin_unlock_irq(&mapping->tree_lock);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -2348,10 +2217,7 @@ int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
- bool locked;
- unsigned long memcg_flags;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2372,22 +2238,17 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
-int __test_set_page_writeback(struct page *page, bool keep_write)
+int test_set_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
- bool locked;
- unsigned long memcg_flags;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2405,21 +2266,19 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- if (!keep_write)
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_TOWRITE);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
if (!ret)
account_page_writeback(page);
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
-EXPORT_SYMBOL(__test_set_page_writeback);
+EXPORT_SYMBOL(test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
@@ -2430,23 +2289,3 @@ int mapping_tagged(struct address_space *mapping, int tag)
return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
-
-/**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page: The page to wait on.
- *
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback. If so, then
- * it will wait for any pending writeback to complete.
- */
-void wait_for_stable_page(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- if (!bdi_cap_stable_pages_required(bdi))
- return;
-
- wait_on_page_writeback(page);
-}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0ea758b898f..df2022ff0c8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,21 +56,13 @@
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
-#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/page-debug-flags.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/rt.h>
-#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
-/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
-static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION (8)
-
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -106,9 +98,6 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
/*
@@ -206,7 +195,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
int min_free_kbytes = 1024;
-int user_min_free_kbytes = -1;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -235,8 +223,8 @@ int page_group_by_mobility_disabled __read_mostly;
void set_pageblock_migratetype(struct page *page, int migratetype)
{
- if (unlikely(page_group_by_mobility_disabled &&
- migratetype < MIGRATE_PCPTYPES))
+
+ if (unlikely(page_group_by_mobility_disabled))
migratetype = MIGRATE_UNMOVABLE;
set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -251,21 +239,15 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
int ret = 0;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
- unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
- start_pfn = zone->zone_start_pfn;
- sp = zone->spanned_pages;
- if (!zone_spans_pfn(zone, pfn))
+ if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+ ret = 1;
+ else if (pfn < zone->zone_start_pfn)
ret = 1;
} while (zone_span_seqretry(zone, seq));
- if (ret)
- pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
- pfn, zone_to_nid(zone), zone->name,
- start_pfn, start_pfn + sp);
-
return ret;
}
@@ -297,8 +279,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, const char *reason,
- unsigned long bad_flags)
+static void bad_page(struct page *page)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -306,7 +287,7 @@ static void bad_page(struct page *page, const char *reason,
/* Don't complain about poisoned pages */
if (PageHWPoison(page)) {
- page_mapcount_reset(page); /* remove PageBuddy */
+ reset_page_mapcount(page); /* remove PageBuddy */
return;
}
@@ -332,14 +313,14 @@ static void bad_page(struct page *page, const char *reason,
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ dump_page(page);
print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
- page_mapcount_reset(page); /* remove PageBuddy */
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ reset_page_mapcount(page); /* remove PageBuddy */
+ add_taint(TAINT_BAD_PAGE);
}
/*
@@ -372,11 +353,9 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
+ __SetPageTail(p);
set_page_count(p, 0);
p->first_page = page;
- /* Make sure p->first_page is always valid for PageTail() */
- smp_wmb();
- __SetPageTail(p);
}
}
@@ -388,7 +367,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int bad = 0;
if (unlikely(compound_order(page) != order)) {
- bad_page(page, "wrong compound order", 0);
+ bad_page(page);
bad++;
}
@@ -397,11 +376,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p))) {
- bad_page(page, "PageTail not set", 0);
- bad++;
- } else if (unlikely(p->first_page != page)) {
- bad_page(page, "first_page not consistent", 0);
+ if (unlikely(!PageTail(p) || (p->first_page != page))) {
+ bad_page(page);
bad++;
}
__ClearPageTail(p);
@@ -410,8 +386,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
return bad;
}
-static inline void prep_zero_page(struct page *page, unsigned int order,
- gfp_t gfp_flags)
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
int i;
@@ -455,7 +430,7 @@ static inline void set_page_guard_flag(struct page *page) { }
static inline void clear_page_guard_flag(struct page *page) { }
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_page_order(struct page *page, int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -498,39 +473,27 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
- unsigned int order)
+ int order)
{
if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
- if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON(page_count(buddy) != 0);
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
- /*
- * zone check is done late to avoid uselessly
- * calculating zone/node ids for pages that could
- * never merge.
- */
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
-
+ VM_BUG_ON(page_count(buddy) != 0);
return 1;
}
return 0;
@@ -549,9 +512,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
+ * order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -562,7 +524,6 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
*/
static inline void __free_one_page(struct page *page,
- unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
@@ -571,18 +532,16 @@ static inline void __free_one_page(struct page *page,
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- VM_BUG_ON(!zone_is_initialized(zone));
-
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
return;
VM_BUG_ON(migratetype == -1);
- page_idx = pfn & ((1 << MAX_ORDER) - 1);
+ page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
- VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
- VM_BUG_ON_PAGE(bad_range(zone, page), page);
+ VM_BUG_ON(page_idx & ((1 << order) - 1));
+ VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
buddy_idx = __find_buddy_index(page_idx, order);
@@ -638,26 +597,15 @@ out:
static inline int free_pages_check(struct page *page)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
-
- if (unlikely(page_mapcount(page)))
- bad_reason = "nonzero mapcount";
- if (unlikely(page->mapping != NULL))
- bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
- bad_reason = "nonzero _count";
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
- }
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (atomic_read(&page->_count) != 0) |
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+ (mem_cgroup_bad_page_check(page)))) {
+ bad_page(page);
return 1;
}
- page_cpupid_reset_last(page);
+ reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -682,6 +630,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
int to_free = count;
spin_lock(&zone->lock);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
while (to_free) {
@@ -714,9 +663,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
list_del(&page->lru);
mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ __free_one_page(page, zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
- if (likely(!is_migrate_isolate_page(page))) {
+ if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
if (is_migrate_cma(mt))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -726,16 +675,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock(&zone->lock);
}
-static void free_one_page(struct zone *zone,
- struct page *page, unsigned long pfn,
- unsigned int order,
+static void free_one_page(struct zone *zone, struct page *page, int order,
int migratetype)
{
spin_lock(&zone->lock);
+ zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __free_one_page(page, pfn, zone, order, migratetype);
- if (unlikely(!is_migrate_isolate(migratetype)))
+ __free_one_page(page, zone, order, migratetype);
+ if (unlikely(migratetype != MIGRATE_ISOLATE))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
@@ -756,8 +704,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
return false;
if (!PageHighMem(page)) {
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE << order);
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
@@ -771,41 +718,47 @@ static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
- unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order))
return;
- migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
+ migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
+ free_one_page(page_zone(page), page, order, migratetype);
local_irq_restore(flags);
}
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() should only be used
+ * at boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
+void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
- struct page *p = page;
unsigned int loop;
- prefetchw(p);
- for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
- prefetchw(p + 1);
+ prefetchw(page);
+ for (loop = 0; loop < nr_pages; loop++) {
+ struct page *p = &page[loop];
+
+ if (loop + 1 < nr_pages)
+ prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
- __ClearPageReserved(p);
- set_page_count(p, 0);
- page_zone(page)->managed_pages += nr_pages;
+ page_zone(page)->managed_pages += 1 << order;
set_page_refcounted(page);
__free_pages(page, order);
}
#ifdef CONFIG_CMA
-/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
@@ -816,22 +769,10 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
+ set_page_refcounted(page);
set_pageblock_migratetype(page, MIGRATE_CMA);
-
- if (pageblock_order >= MAX_ORDER) {
- i = pageblock_nr_pages;
- p = page;
- do {
- set_page_refcounted(p);
- __free_pages(p, MAX_ORDER - 1);
- p += MAX_ORDER_NR_PAGES;
- } while (i -= MAX_ORDER_NR_PAGES);
- } else {
- set_page_refcounted(page);
- __free_pages(page, pageblock_order);
- }
-
- adjust_managed_page_count(page, pageblock_nr_pages);
+ __free_pages(page, pageblock_order);
+ totalram_pages += pageblock_nr_pages;
}
#endif
@@ -859,7 +800,7 @@ static inline void expand(struct zone *zone, struct page *page,
area--;
high--;
size >>= 1;
- VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
+ VM_BUG_ON(bad_range(zone, &page[size]));
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
@@ -889,29 +830,18 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
-
- if (unlikely(page_mapcount(page)))
- bad_reason = "nonzero mapcount";
- if (unlikely(page->mapping != NULL))
- bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
- bad_reason = "nonzero _count";
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
- }
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
- if (unlikely(bad_reason)) {
- bad_page(page, bad_reason, bad_flags);
+ if (unlikely(page_mapcount(page) |
+ (page->mapping != NULL) |
+ (atomic_read(&page->_count) != 0) |
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+ (mem_cgroup_bad_page_check(page)))) {
+ bad_page(page);
return 1;
}
return 0;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
int i;
@@ -945,7 +875,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
- struct free_area *area;
+ struct free_area * area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
@@ -960,7 +890,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
return page;
}
@@ -982,9 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
-#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
-#endif
};
/*
@@ -1013,7 +940,7 @@ int move_freepages(struct zone *zone,
for (page = start_page; page <= end_page;) {
/* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1049,9 +976,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
- if (!zone_spans_pfn(zone, start_pfn))
+ if (start_pfn < zone->zone_start_pfn)
start_page = page;
- if (!zone_spans_pfn(zone, end_pfn))
+ if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
return 0;
return move_freepages(zone, start_page, end_page, migratetype);
@@ -1068,71 +995,18 @@ static void change_pageblock_range(struct page *pageblock_page,
}
}
-/*
- * If breaking a large block of pages, move all free pages to the preferred
- * allocation list. If falling back for a reclaimable kernel allocation, be
- * more aggressive about taking ownership of free pages.
- *
- * On the other hand, never change migration type of MIGRATE_CMA pageblocks
- * nor move CMA pages to different free lists. We don't want unmovable pages
- * to be allocated from MIGRATE_CMA areas.
- *
- * Returns the new migratetype of the pageblock (or the same old migratetype
- * if it was unchanged).
- */
-static int try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
-{
- int current_order = page_order(page);
-
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself. We also ensure the freepage_migratetype
- * is set to CMA so it is returned to the correct freelist in case
- * the page ends up being not actually allocated from the pcp lists.
- */
- if (is_migrate_cma(fallback_type))
- return fallback_type;
-
- /* Take ownership for orders >= pageblock_order */
- if (current_order >= pageblock_order) {
- change_pageblock_range(page, current_order, start_type);
- return start_type;
- }
-
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- page_group_by_mobility_disabled) {
- int pages;
-
- pages = move_freepages_block(zone, page, start_type);
-
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled) {
-
- set_pageblock_migratetype(page, start_type);
- return start_type;
- }
-
- }
-
- return fallback_type;
-}
-
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
- struct free_area *area;
- unsigned int current_order;
+ struct free_area * area;
+ int current_order;
struct page *page;
- int migratetype, new_type, i;
+ int migratetype, i;
/* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1;
- current_order >= order && current_order <= MAX_ORDER-1;
- --current_order) {
+ for (current_order = MAX_ORDER-1; current_order >= order;
+ --current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
@@ -1148,25 +1022,51 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct page, lru);
area->nr_free--;
- new_type = try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
+ /*
+ * If breaking a large block of pages, move all free
+ * pages to the preferred allocation list. If falling
+ * back for a reclaimable kernel allocation, be more
+ * aggressive about taking ownership of free pages
+ *
+ * On the other hand, never change migration
+ * type of MIGRATE_CMA pageblocks nor move CMA
+ * pages on different free lists. We don't
+ * want unmovable pages to be allocated from
+ * MIGRATE_CMA areas.
+ */
+ if (!is_migrate_cma(migratetype) &&
+ (unlikely(current_order >= pageblock_order / 2) ||
+ start_migratetype == MIGRATE_RECLAIMABLE ||
+ page_group_by_mobility_disabled)) {
+ int pages;
+ pages = move_freepages_block(zone, page,
+ start_migratetype);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page,
+ start_migratetype);
+
+ migratetype = start_migratetype;
+ }
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
+ /* Take ownership for orders >= pageblock_order */
+ if (current_order >= pageblock_order &&
+ !is_migrate_cma(migratetype))
+ change_pageblock_range(page, current_order,
+ start_migratetype);
+
expand(zone, page, order, current_order, area,
- new_type);
- /* The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages. This is OK as long as it does
- * not differ for MIGRATE_CMA type.
- */
- set_freepage_migratetype(page, new_type);
+ is_migrate_cma(migratetype)
+ ? migratetype : start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype, new_type);
+ start_migratetype, migratetype);
return page;
}
@@ -1212,9 +1112,9 @@ retry_reserve:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, bool cold)
+ int migratetype, int cold)
{
- int i;
+ int mt = migratetype, i;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -1231,12 +1131,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* merge IO requests if the physical pages are ordered
* properly.
*/
- if (likely(!cold))
+ if (likely(cold == 0))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
+ if (IS_ENABLED(CONFIG_CMA)) {
+ mt = get_pageblock_migratetype(page);
+ if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
+ mt = migratetype;
+ }
+ set_freepage_migratetype(page, mt);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(mt))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1258,12 +1164,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain;
- unsigned long batch;
local_irq_save(flags);
- batch = ACCESS_ONCE(pcp->batch);
- if (pcp->count >= batch)
- to_drain = batch;
+ if (pcp->count >= pcp->batch)
+ to_drain = pcp->batch;
else
to_drain = pcp->count;
if (to_drain > 0) {
@@ -1360,15 +1264,15 @@ void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn;
unsigned long flags;
- unsigned int order, t;
+ int order, t;
struct list_head *curr;
- if (zone_is_empty(zone))
+ if (!zone->spanned_pages)
return;
spin_lock_irqsave(&zone->lock, flags);
- max_zone_pfn = zone_end_pfn(zone);
+ max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
@@ -1392,20 +1296,19 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
- * cold == true ? free a cold page : free a hot page
+ * cold == 1 ? free a cold page : free a hot page
*/
-void free_hot_cold_page(struct page *page, bool cold)
+void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pages_prepare(page, 0))
return;
- migratetype = get_pfnblock_migratetype(page, pfn);
+ migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -1418,23 +1321,22 @@ void free_hot_cold_page(struct page *page, bool cold)
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
- if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+ free_one_page(zone, page, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (!cold)
- list_add(&page->lru, &pcp->lists[migratetype]);
- else
+ if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ else
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- unsigned long batch = ACCESS_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
- pcp->count -= batch;
+ free_pcppages_bulk(zone, pcp->batch, pcp);
+ pcp->count -= pcp->batch;
}
out:
@@ -1444,7 +1346,7 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, bool cold)
+void free_hot_cold_page_list(struct list_head *list, int cold)
{
struct page *page, *next;
@@ -1466,8 +1368,8 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- VM_BUG_ON_PAGE(!page_count(page), page);
+ VM_BUG_ON(PageCompound(page));
+ VM_BUG_ON(!page_count(page));
#ifdef CONFIG_KMEMCHECK
/*
@@ -1481,7 +1383,6 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
}
-EXPORT_SYMBOL_GPL(split_page);
static int __isolate_free_page(struct page *page, unsigned int order)
{
@@ -1494,7 +1395,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
zone = page_zone(page);
mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt)) {
+ if (mt != MIGRATE_ISOLATE) {
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
@@ -1513,7 +1414,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -1556,12 +1457,12 @@ int split_free_page(struct page *page)
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ struct zone *zone, int order, gfp_t gfp_flags,
+ int migratetype)
{
unsigned long flags;
struct page *page;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
+ int cold = !!(gfp_flags & __GFP_COLD);
again:
if (likely(order == 0)) {
@@ -1606,16 +1507,14 @@ again:
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pageblock_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
-
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
- VM_BUG_ON_PAGE(bad_range(zone, page), page);
+ VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
@@ -1706,15 +1605,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags,
- long free_pages)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
- long free_cma = 0;
free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
@@ -1724,10 +1621,9 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
- free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-
- if (free_pages - free_cma <= min + lowmem_reserve)
+ if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1742,15 +1638,15 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
return true;
}
-bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags)
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
@@ -1769,7 +1665,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
- * If the zonelist cache is present in the passed zonelist, then
+ * If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_MEMORY].)
*
@@ -1878,15 +1774,20 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
-static bool zone_local(struct zone *local_zone, struct zone *zone)
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
- return local_zone->node == zone->node;
+ return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
}
-static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+static void __paginginit init_zone_allows_reclaim(int nid)
{
- return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
- RECLAIM_DISTANCE;
+ int i;
+
+ for_each_online_node(i)
+ if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ else
+ zone_reclaim_mode = 1;
}
#else /* CONFIG_NUMA */
@@ -1910,16 +1811,14 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
-static bool zone_local(struct zone *local_zone, struct zone *zone)
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
-static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+static inline void init_zone_allows_reclaim(int nid)
{
- return true;
}
-
#endif /* CONFIG_NUMA */
/*
@@ -1929,46 +1828,31 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int classzone_idx, int migratetype)
+ struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
+ int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
- bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE);
+ classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
- unsigned long mark;
-
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- if (cpusets_enabled() &&
- (alloc_flags & ALLOC_CPUSET) &&
+ if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
/*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(preferred_zone, zone))
- continue;
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
- }
- /*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
@@ -1994,17 +1878,18 @@ zonelist_scan:
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if (consider_zone_dirty && !zone_dirty_ok(zone))
- continue;
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+ unsigned long mark;
int ret;
- /* Checked here to keep the fast path fast */
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- if (alloc_flags & ALLOC_NO_WATERMARKS)
+ mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags))
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
@@ -2041,24 +1926,9 @@ zonelist_scan:
continue;
default:
/* did we reclaim enough */
- if (zone_watermark_ok(zone, order, mark,
+ if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
- goto try_this_zone;
-
- /*
- * Failed to reclaim enough to meet watermark.
- * Only mark the zone full if checking the min
- * watermark or if we failed to reclaim just
- * 1<<order pages or else the page allocator
- * fastpath will prematurely mark zones full
- * when the watermark is between the low and
- * min watermarks.
- */
- if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
- ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
-
- continue;
}
}
@@ -2068,7 +1938,7 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
+ if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
@@ -2197,7 +2067,7 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ int migratetype)
{
struct page *page;
@@ -2215,7 +2085,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, classzone_idx, migratetype);
+ preferred_zone, migratetype);
if (page)
goto out;
@@ -2250,7 +2120,7 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, enum migrate_mode mode,
+ int migratetype, bool sync_migration,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
@@ -2264,7 +2134,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, mode,
+ nodemask, sync_migration,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
@@ -2278,10 +2148,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ preferred_zone, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- compaction_defer_reset(preferred_zone, order, true);
+ preferred_zone->compact_considered = 0;
+ preferred_zone->compact_defer_shift = 0;
+ if (order >= preferred_zone->compact_order_failed)
+ preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2297,7 +2170,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
- if (mode != MIGRATE_ASYNC)
+ if (sync_migration)
defer_compaction(preferred_zone, order);
cond_resched();
@@ -2310,9 +2183,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype,
- enum migrate_mode mode, bool *contended_compaction,
- bool *deferred_compaction, unsigned long *did_some_progress)
+ int migratetype, bool sync_migration,
+ bool *contended_compaction, bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2351,7 +2224,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
@@ -2369,8 +2242,7 @@ retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx,
- migratetype);
+ preferred_zone, migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2393,14 +2265,14 @@ static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2409,38 +2281,16 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void reset_alloc_batches(struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
-{
- struct zoneref *z;
- struct zone *zone;
-
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- /*
- * Only reset the batches of zones that were actually
- * considered in the fairness pass, we don't want to
- * trash fairness information for zones that are not
- * actually part of this zonelist's round-robin cycle.
- */
- if (!zone_local(preferred_zone, zone))
- continue;
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- }
-}
-
-static void wake_all_kswapds(unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
@@ -2501,14 +2351,14 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ bool sync_migration = false;
bool deferred_compaction = false;
bool contended_compaction = false;
@@ -2532,12 +2382,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2550,18 +2401,15 @@ restart:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
- struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
- NULL, &preferred_zone);
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
- }
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+ first_zones_zonelist(zonelist, high_zoneidx, NULL,
+ &preferred_zone);
rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ preferred_zone, migratetype);
if (page)
goto got_pg;
@@ -2576,22 +2424,15 @@ rebalance:
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
+ preferred_zone, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
- if (!wait) {
- /*
- * All existing users of the deprecated __GFP_NOFAIL are
- * blockable, so warn of any new users that actually allow this
- * type of allocation to fail.
- */
- WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
+ if (!wait)
goto nopage;
- }
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
@@ -2605,23 +2446,17 @@ rebalance:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, sync_migration,
+ &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
-
- /*
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse.
- */
- if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_SYNC_LIGHT;
+ sync_migration = true;
/*
* If compaction is deferred for high-order allocations, it is because
@@ -2638,8 +2473,7 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- classzone_idx, migratetype,
- &did_some_progress);
+ migratetype, &did_some_progress);
if (page)
goto got_pg;
@@ -2648,7 +2482,7 @@ rebalance:
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
- if (oom_gfp_allowed(gfp_mask)) {
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
@@ -2658,7 +2492,7 @@ rebalance:
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
- classzone_idx, migratetype);
+ migratetype);
if (page)
goto got_pg;
@@ -2697,11 +2531,12 @@ rebalance:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, sync_migration,
+ &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
@@ -2727,12 +2562,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct zoneref *preferred_zoneref;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- int classzone_idx;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2751,53 +2585,35 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ /*
+ * Will only have any effect when __GFP_KMEMCG is set. This is
+ * verified in the (always inline) callee
+ */
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+
retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
+ cpuset_mems_cookie = get_mems_allowed();
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
-retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
- preferred_zone, classzone_idx, migratetype);
- if (unlikely(!page)) {
- /*
- * The first pass makes sure allocations are spread
- * fairly within the local node. However, the local
- * node might have free pages left after the fairness
- * batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without
- * fairness, and include remote zones now, before
- * entering the slowpath and waking kswapd: prefer
- * spilling to a remote zone over swapping locally.
- */
- if (alloc_flags & ALLOC_FAIR) {
- reset_alloc_batches(zonelist, high_zoneidx,
- preferred_zone);
- alloc_flags &= ~ALLOC_FAIR;
- goto retry;
- }
- /*
- * Runtime PM, block IO and its error handling path
- * can deadlock because I/O on the device might not
- * complete.
- */
- gfp_mask = memalloc_noio_flags(gfp_mask);
+ preferred_zone, migratetype);
+ if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
- }
+ preferred_zone, migratetype);
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2808,9 +2624,11 @@ out:
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
+ memcg_kmem_commit_charge(page, memcg, order);
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2845,7 +2663,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, false);
+ free_hot_cold_page(page, 0);
else
__free_pages_ok(page, order);
}
@@ -2864,51 +2682,27 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
+ * pages allocated with __GFP_KMEMCG.
*
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
- struct mem_cgroup *memcg = NULL;
-
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
- page = alloc_pages(gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
- return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
- struct mem_cgroup *memcg = NULL;
-
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
- page = alloc_pages_node(nid, gfp_mask, order);
- memcg_kmem_commit_charge(page, memcg, order);
- return page;
-}
-
-/*
- * __free_kmem_pages and free_kmem_pages will free pages allocated with
- * alloc_kmem_pages.
+ * Those pages are accounted to a particular memcg, embedded in the
+ * corresponding page_cgroup. To avoid adding a hit in the allocator to search
+ * for that information only to find out that it is NULL for users who have no
+ * interest in that whatsoever, we provide these functions.
+ *
+ * The caller knows better which flags it relies on.
*/
-void __free_kmem_pages(struct page *page, unsigned int order)
+void __free_memcg_kmem_pages(struct page *page, unsigned int order)
{
memcg_kmem_uncharge_pages(page, order);
__free_pages(page, order);
}
-void free_kmem_pages(unsigned long addr, unsigned int order)
+void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_kmem_pages(virt_to_page((void *)addr), order);
+ __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
}
}
@@ -2991,27 +2785,18 @@ void free_pages_exact(void *virt, size_t size)
}
EXPORT_SYMBOL(free_pages_exact);
-/**
- * nr_free_zone_pages - count number of pages beyond high watermark
- * @offset: The zone index of the highest zone
- *
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
- * high watermark within all zones at or below a given zone index. For each
- * zone, the number of pages is calculated as:
- * managed_pages - high_pages
- */
-static unsigned long nr_free_zone_pages(int offset)
+static unsigned int nr_free_zone_pages(int offset)
{
struct zoneref *z;
struct zone *zone;
/* Just pick one node, since fallback list is circular */
- unsigned long sum = 0;
+ unsigned int sum = 0;
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
- unsigned long size = zone->managed_pages;
+ unsigned long size = zone->present_pages;
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
@@ -3020,25 +2805,19 @@ static unsigned long nr_free_zone_pages(int offset)
return sum;
}
-/**
- * nr_free_buffer_pages - count number of pages beyond high watermark
- *
- * nr_free_buffer_pages() counts the number of pages which are beyond the high
- * watermark within ZONE_DMA and ZONE_NORMAL.
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
*/
-unsigned long nr_free_buffer_pages(void)
+unsigned int nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-/**
- * nr_free_pagecache_pages - count number of pages beyond high watermark
- *
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
- * high watermark within all zones.
+/*
+ * Amount of free RAM allocatable within all zones
*/
-unsigned long nr_free_pagecache_pages(void)
+unsigned int nr_free_pagecache_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}
@@ -3065,16 +2844,12 @@ EXPORT_SYMBOL(si_meminfo);
#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
- int zone_type; /* needs to be signed */
- unsigned long managed_pages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
- val->totalram = managed_pages;
+ val->totalram = pgdat->node_present_pages;
val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
- val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
+ val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
#else
@@ -3098,9 +2873,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
goto out;
do {
- cpuset_mems_cookie = read_mems_allowed_begin();
+ cpuset_mems_cookie = get_mems_allowed();
ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
@@ -3117,9 +2892,7 @@ static void show_migration_types(unsigned char type)
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
-#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = 'I',
-#endif
};
char tmp[MIGRATE_TYPES + 1];
char *p = tmp;
@@ -3254,7 +3027,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (!zone_reclaimable(zone) ? "yes" : "no")
+ (zone->all_unreclaimable ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3263,7 +3036,7 @@ void show_free_areas(unsigned int filter)
}
for_each_populated_zone(zone) {
- unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned long nr[MAX_ORDER], flags, order, total = 0;
unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3294,8 +3067,6 @@ void show_free_areas(unsigned int filter)
printk("= %lukB\n", K(total));
}
- hugetlb_show_meminfo();
-
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
show_swap_cache_info();
@@ -3313,10 +3084,12 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
* Add all populated zones of a node to the zonelist.
*/
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+ int nr_zones, enum zone_type zone_type)
{
struct zone *zone;
- enum zone_type zone_type = MAX_NR_ZONES;
+
+ BUG_ON(zone_type >= MAX_NR_ZONES);
+ zone_type++;
do {
zone_type--;
@@ -3326,8 +3099,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
- } while (zone_type);
+ } while (zone_type);
return nr_zones;
}
@@ -3402,7 +3175,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(struct ctl_table *table, int write,
+int numa_zonelist_order_handler(ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
@@ -3411,25 +3184,18 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
static DEFINE_MUTEX(zl_order_mutex);
mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
+ if (write)
+ strcpy(saved_string, (char*)table->data);
ret = proc_dostring(table, write, buffer, length, ppos);
if (ret)
goto out;
if (write) {
int oldval = user_zonelist_order;
-
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
+ if (__parse_numa_zonelist_order((char*)table->data)) {
/*
* bogus value. restore saved string
*/
- strncpy((char *)table->data, saved_string,
+ strncpy((char*)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
@@ -3465,7 +3231,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
- int best_node = NUMA_NO_NODE;
+ int best_node = -1;
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
@@ -3521,7 +3287,8 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
@@ -3535,7 +3302,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
@@ -3575,11 +3342,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
static int default_zonelist_order(void)
{
int nid, zone_type;
- unsigned long low_kmem_size, total_size;
+ unsigned long low_kmem_size,total_size;
struct zone *z;
int average_size;
/*
- * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
+ * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
* This function detect ZONE_DMA/DMA32 size and configures zone order.
@@ -3592,8 +3359,8 @@ static int default_zonelist_order(void)
z = &NODE_DATA(nid)->node_zones[zone_type];
if (populated_zone(z)) {
if (zone_type < ZONE_NORMAL)
- low_kmem_size += z->managed_pages;
- total_size += z->managed_pages;
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
} else if (zone_type == ZONE_NORMAL) {
/*
* If any node has only lowmem, then node order
@@ -3611,9 +3378,9 @@ static int default_zonelist_order(void)
return ZONELIST_ORDER_NODE;
/*
* look into each node's config.
- * If there is a node whose DMA/DMA32 memory is very big area on
- * local memory, NODE_ORDER may be suitable.
- */
+ * If there is a node whose DMA/DMA32 memory is very big area on
+ * local memory, NODE_ORDER may be suitable.
+ */
average_size = total_size /
(nodes_weight(node_states[N_MEMORY]) + 1);
for_each_online_node(nid) {
@@ -3743,7 +3510,7 @@ static void build_zonelists(pg_data_t *pgdat)
local_node = pgdat->node_id;
zonelist = &pgdat->node_zonelists[0];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
/*
* Now we build the zonelist so that it contains the zones
@@ -3756,12 +3523,14 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+ MAX_NR_ZONES - 1);
}
zonelist->_zonerefs[j].zone = NULL;
@@ -3870,12 +3639,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
} else {
+ /* we have to stop all cpus to guarantee there is no user
+ of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
setup_zone_pageset(zone);
#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
@@ -3969,6 +3738,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
return ffz(~size);
}
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
/*
* Check if a pageblock contains reserved pages
*/
@@ -3996,7 +3767,6 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
- int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -4005,7 +3775,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* the block.
*/
start_pfn = zone->zone_start_pfn;
- end_pfn = zone_end_pfn(zone);
+ end_pfn = start_pfn + zone->spanned_pages;
start_pfn = roundup(start_pfn, pageblock_nr_pages);
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
@@ -4018,12 +3788,6 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
- old_reserve = zone->nr_migrate_reserve_block;
-
- /* When memory hot-add, we almost always need to do nothing */
- if (reserve == old_reserve)
- return;
- zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -4061,12 +3825,6 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
- } else if (!old_reserve) {
- /*
- * At boot time we don't need to scan the whole zone
- * for turning off MIGRATE_RESERVE.
- */
- break;
}
/*
@@ -4113,8 +3871,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_page_links(page, zone, nid, pfn);
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
+ reset_page_mapcount(page);
+ reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4131,7 +3889,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* pfn out of zone.
*/
if ((z->zone_start_pfn <= pfn)
- && (pfn < zone_end_pfn(z))
+ && (pfn < z->zone_start_pfn + z->spanned_pages)
&& !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
@@ -4146,7 +3904,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
static void __meminit zone_init_free_lists(struct zone *zone)
{
- unsigned int order, t;
+ int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
@@ -4158,7 +3916,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4169,7 +3927,7 @@ static int zone_batchsize(struct zone *zone)
*
* OK, so we don't know how big the cache is. So guess.
*/
- batch = zone->managed_pages / 1024;
+ batch = zone->present_pages / 1024;
if (batch * PAGE_SIZE > 512 * 1024)
batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
@@ -4208,40 +3966,7 @@ static int zone_batchsize(struct zone *zone)
#endif
}
-/*
- * pcp->high and pcp->batch values are related and dependent on one another:
- * ->batch must never be higher then ->high.
- * The following function updates them in a safe manner without read side
- * locking.
- *
- * Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording the the above rule).
- *
- * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
- * outside of boot time (or some other assurance that no concurrent updaters
- * exist).
- */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
- unsigned long batch)
-{
- /* start with a fail safe value for batch */
- pcp->batch = 1;
- smp_wmb();
-
- /* Update high, then batch, in order */
- pcp->high = high;
- smp_wmb();
-
- pcp->batch = batch;
-}
-
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
-{
- pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
-}
-
-static void pageset_init(struct per_cpu_pageset *p)
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
int migratetype;
@@ -4250,55 +3975,45 @@ static void pageset_init(struct per_cpu_pageset *p)
pcp = &p->pcp;
pcp->count = 0;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
-{
- pageset_init(p);
- pageset_set_batch(p, batch);
-}
-
/*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/
-static void pageset_set_high(struct per_cpu_pageset *p,
- unsigned long high)
-{
- unsigned long batch = max(1UL, high / 4);
- if ((high / 4) > (PAGE_SHIFT * 8))
- batch = PAGE_SHIFT * 8;
-
- pageset_update(&p->pcp, high, batch);
-}
-static void pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
-{
- if (percpu_pagelist_fraction)
- pageset_set_high(pcp,
- (zone->managed_pages /
- percpu_pagelist_fraction));
- else
- pageset_set_batch(pcp, zone_batchsize(zone));
-}
-
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+ unsigned long high)
{
- struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+ struct per_cpu_pages *pcp;
- pageset_init(pcp);
- pageset_set_high_and_batch(zone, pcp);
+ pcp = &p->pcp;
+ pcp->high = high;
+ pcp->batch = max(1UL, high/4);
+ if ((high/4) > (PAGE_SHIFT * 8))
+ pcp->batch = PAGE_SHIFT * 8;
}
static void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
+
zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_possible_cpu(cpu)
- zone_pageset_init(zone, cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+ setup_pageset(pcp, zone_batchsize(zone));
+
+ if (percpu_pagelist_fraction)
+ setup_pagelist_highmark(pcp,
+ (zone->present_pages /
+ percpu_pagelist_fraction));
+ }
}
/*
@@ -4317,6 +4032,7 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
+ struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
@@ -4332,8 +4048,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- memblock_virt_alloc_node_nopanic(
- alloc_size, zone->zone_pgdat->node_id);
+ alloc_bootmem_node_nopanic(pgdat, alloc_size);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4350,7 +4065,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!zone->wait_table)
return -ENOMEM;
- for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+ for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
init_waitqueue_head(zone->wait_table + i);
return 0;
@@ -4365,7 +4080,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
*/
zone->pageset = &boot_pageset;
- if (populated_zone(zone))
+ if (zone->present_pages)
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
@@ -4400,29 +4115,20 @@ int __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative
*/
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
unsigned long start_pfn, end_pfn;
- int nid;
- /*
- * NOTE: The following SMP-unsafe globals are only used early in boot
- * when the kernel is running single-threaded.
- */
- static unsigned long __meminitdata last_start_pfn, last_end_pfn;
- static int __meminitdata last_nid;
-
- if (last_start_pfn <= pfn && pfn < last_end_pfn)
- return last_nid;
-
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
- last_start_pfn = start_pfn;
- last_end_pfn = end_pfn;
- last_nid = nid;
- }
+ int i, nid;
- return nid;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ if (start_pfn <= pfn && pfn < end_pfn)
+ return nid;
+ /* This is a memory hole */
+ return -1;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
@@ -4450,13 +4156,13 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif
/**
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
*
- * If an architecture guarantees that all ranges registered contain no holes
- * and may be freed, this this function may be used instead of calling
- * memblock_free_early_nid() manually.
+ * If an architecture guarantees that all ranges registered with
+ * add_active_ranges() contain no holes and may be freed, this
+ * this function may be used instead of calling free_bootmem() manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4468,9 +4174,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);
if (start_pfn < end_pfn)
- memblock_free_early_nid(PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT,
- this_nid);
+ free_bootmem_node(NODE_DATA(this_nid),
+ PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT);
}
}
@@ -4478,8 +4184,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
*
- * If an architecture guarantees that all ranges registered contain no holes and may
- * be freed, this function may be used instead of calling memory_present() manually.
+ * If an architecture guarantees that all ranges registered with
+ * add_active_ranges() contain no holes and may be freed, this
+ * function may be used instead of calling memory_present() manually.
*/
void __init sparse_memory_present_with_active_regions(int nid)
{
@@ -4497,7 +4204,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
- * provided by memblock_set_node(). If called for a node
+ * provided by an arch calling add_active_range(). If called for a node
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
@@ -4582,13 +4289,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*/
static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
unsigned long *ignored)
{
+ unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
- /* Get the start and end of the zone */
+ /* Get the start and end of the node and zone */
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4643,14 +4350,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
unsigned long *ignored)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
+ unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
@@ -4663,8 +4370,6 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
unsigned long *zones_size)
{
return zones_size[zone_type];
@@ -4672,8 +4377,6 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
unsigned long *zholes_size)
{
if (!zholes_size)
@@ -4685,27 +4388,21 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zones_size,
- unsigned long *zholes_size)
+ unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
enum zone_type i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- zones_size);
+ zones_size);
pgdat->node_spanned_pages = totalpages;
realtotalpages = totalpages;
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -=
zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn, node_end_pfn,
- zholes_size);
+ zholes_size);
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
@@ -4719,11 +4416,10 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
* round what is now in bits to nearest long in bits, then return it in
* bytes.
*/
-static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+static unsigned long __init usemap_size(unsigned long zonesize)
{
unsigned long usemapsize;
- zonesize += zone_start_pfn & (pageblock_nr_pages-1);
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4733,26 +4429,23 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
}
static void __init setup_usemap(struct pglist_data *pgdat,
- struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long zonesize)
+ struct zone *zone, unsigned long zonesize)
{
- unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
+ unsigned long usemapsize = usemap_size(zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags =
- memblock_virt_alloc_node_nopanic(usemapsize,
- pgdat->node_id);
+ zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+ usemapsize);
}
#else
-static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
- unsigned long zone_start_pfn, unsigned long zonesize) {}
+static inline void setup_usemap(struct pglist_data *pgdat,
+ struct zone *zone, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -4780,7 +4473,7 @@ void __paginginit set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
@@ -4815,7 +4508,6 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
@@ -4837,11 +4529,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
- node_end_pfn, zones_size);
+ size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
- node_start_pfn,
- node_end_pfn,
zholes_size);
/*
@@ -4876,7 +4565,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
nr_all_pages += freesize;
zone->spanned_pages = size;
- zone->present_pages = realsize;
+ zone->present_pages = freesize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
@@ -4894,17 +4583,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone_pcp_init(zone);
-
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
+ zone_pcp_init(zone);
lruvec_init(&zone->lruvec);
if (!size)
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
+ setup_usemap(pgdat, zone, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
@@ -4931,13 +4617,12 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- end = pgdat_end_pfn(pgdat);
+ end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = memblock_virt_alloc_node_nopanic(size,
- pgdat->node_id);
+ map = alloc_bootmem_node_nopanic(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4959,19 +4644,14 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = 0;
- unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-#endif
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ init_zone_allows_reclaim(nid);
+ calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4980,8 +4660,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ free_area_init_core(pgdat, zones_size, zholes_size);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -4990,7 +4669,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/*
* Figure out the number of possible node ids.
*/
-void __init setup_nr_node_ids(void)
+static void __init setup_nr_node_ids(void)
{
unsigned int node;
unsigned int highest = 0;
@@ -4999,6 +4678,10 @@ void __init setup_nr_node_ids(void)
highest = node;
nr_node_ids = highest + 1;
}
+#else
+static inline void setup_nr_node_ids(void)
+{
+}
#endif
/**
@@ -5074,7 +4757,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
* It returns the minimum PFN based on information provided via
- * memblock_set_node().
+ * add_active_range().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
@@ -5099,7 +4782,7 @@ static unsigned long __init early_calculate_totalpages(void)
if (pages)
node_set_state(nid, N_MEMORY);
}
- return totalpages;
+ return totalpages;
}
/*
@@ -5117,33 +4800,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- struct memblock_region *r;
-
- /* Need to find movable_zone earlier when movable_node is specified. */
- find_usable_zone_for_movable();
/*
- * If movable_node is specified, ignore kernelcore and movablecore
- * options.
- */
- if (movable_node_is_enabled()) {
- for_each_memblock(memory, r) {
- if (!memblock_is_hotpluggable(r))
- continue;
-
- nid = r->nid;
-
- usable_startpfn = PFN_DOWN(r->base);
- zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
- min(usable_startpfn, zone_movable_pfn[nid]) :
- usable_startpfn;
- }
-
- goto out2;
- }
-
- /*
- * If movablecore=nn[KMG] was specified, calculate what size of
+ * If movablecore was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
@@ -5169,6 +4828,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
@@ -5239,7 +4899,7 @@ restart:
/*
* Some kernelcore has been met, update counts and
* break if the kernelcore for this node has been
- * satisfied
+ * satisified
*/
required_kernelcore -= min(required_kernelcore,
size_pages);
@@ -5253,13 +4913,12 @@ restart:
* If there is still required_kernelcore, we do another pass with one
* less node in the count. This will push zone_movable_pfn[nid] further
* along on the nodes that still have memory until kernelcore is
- * satisfied
+ * satisified
*/
usable_nodes--;
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
-out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
@@ -5280,7 +4939,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (populated_zone(zone)) {
+ if (zone->present_pages) {
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)
@@ -5295,7 +4954,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by memblock_set_node(), the size of each
+ * Using the page ranges provided by add_active_range(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -5413,103 +5072,6 @@ early_param("movablecore", cmdline_parse_movablecore);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-void adjust_managed_page_count(struct page *page, long count)
-{
- spin_lock(&managed_page_count_lock);
- page_zone(page)->managed_pages += count;
- totalram_pages += count;
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages += count;
-#endif
- spin_unlock(&managed_page_count_lock);
-}
-EXPORT_SYMBOL(adjust_managed_page_count);
-
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
-{
- void *pos;
- unsigned long pages = 0;
-
- start = (void *)PAGE_ALIGN((unsigned long)start);
- end = (void *)((unsigned long)end & PAGE_MASK);
- for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
- if ((unsigned int)poison <= 0xFF)
- memset(pos, poison, PAGE_SIZE);
- free_reserved_page(virt_to_page(pos));
- }
-
- if (pages && s)
- pr_info("Freeing %s memory: %ldK (%p - %p)\n",
- s, pages << (PAGE_SHIFT - 10), start, end);
-
- return pages;
-}
-EXPORT_SYMBOL(free_reserved_area);
-
-#ifdef CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
- __free_reserved_page(page);
- totalram_pages++;
- page_zone(page)->managed_pages++;
- totalhigh_pages++;
-}
-#endif
-
-
-void __init mem_init_print_info(const char *str)
-{
- unsigned long physpages, codesize, datasize, rosize, bss_size;
- unsigned long init_code_size, init_data_size;
-
- physpages = get_num_physpages();
- codesize = _etext - _stext;
- datasize = _edata - _sdata;
- rosize = __end_rodata - __start_rodata;
- bss_size = __bss_stop - __bss_start;
- init_data_size = __init_end - __init_begin;
- init_code_size = _einittext - _sinittext;
-
- /*
- * Detect special cases and adjust section sizes accordingly:
- * 1) .init.* may be embedded into .data sections
- * 2) .init.text.* may be out of [__init_begin, __init_end],
- * please refer to arch/tile/kernel/vmlinux.lds.S.
- * 3) .rodata.* may be embedded into .text or .data sections.
- */
-#define adj_init_size(start, end, size, pos, adj) \
- do { \
- if (start <= pos && pos < end && size > adj) \
- size -= adj; \
- } while (0)
-
- adj_init_size(__init_begin, __init_end, init_data_size,
- _sinittext, init_code_size);
- adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
- adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
- adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
- adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
-
-#undef adj_init_size
-
- printk("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved"
-#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
-#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages) << (PAGE_SHIFT-10),
-#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
-#endif
- str ? ", " : "", str ? str : "");
-}
-
/**
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
@@ -5556,7 +5118,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
* This is only okay since the processor is dead and cannot
* race with what we are doing.
*/
- cpu_vm_stats_fold(cpu);
+ refresh_cpu_vm_stats(cpu);
}
return NOTIFY_OK;
}
@@ -5590,8 +5152,8 @@ static void calculate_totalreserve_pages(void)
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > zone->managed_pages)
- max = zone->managed_pages;
+ if (max > zone->present_pages)
+ max = zone->present_pages;
reserve_pages += max;
/*
* Lowmem reserves are not available to
@@ -5623,7 +5185,7 @@ static void setup_per_zone_lowmem_reserve(void)
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone->managed_pages;
+ unsigned long present_pages = zone->present_pages;
zone->lowmem_reserve[j] = 0;
@@ -5637,9 +5199,9 @@ static void setup_per_zone_lowmem_reserve(void)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
- lower_zone->lowmem_reserve[j] = managed_pages /
+ lower_zone->lowmem_reserve[j] = present_pages /
sysctl_lowmem_reserve_ratio[idx];
- managed_pages += lower_zone->managed_pages;
+ present_pages += lower_zone->present_pages;
}
}
}
@@ -5658,14 +5220,14 @@ static void __setup_per_zone_wmarks(void)
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
- lowmem_pages += zone->managed_pages;
+ lowmem_pages += zone->present_pages;
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
+ tmp = (u64)pages_min * zone->present_pages;
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
@@ -5677,10 +5239,13 @@ static void __setup_per_zone_wmarks(void)
* deltas controls asynch page reclaim, and so should
* not be capped for highmem.
*/
- unsigned long min_pages;
+ int min_pages;
- min_pages = zone->managed_pages / 1024;
- min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
+ min_pages = zone->present_pages / 1024;
+ if (min_pages < SWAP_CLUSTER_MAX)
+ min_pages = SWAP_CLUSTER_MAX;
+ if (min_pages > 128)
+ min_pages = 128;
zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
@@ -5693,11 +5258,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
-
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -5746,7 +5306,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
unsigned int gb, ratio;
/* Zone size in gigabytes */
- gb = zone->managed_pages >> (30 - PAGE_SHIFT);
+ gb = zone->present_pages >> (30 - PAGE_SHIFT);
if (gb)
ratio = int_sqrt(10 * gb);
else
@@ -5770,7 +5330,7 @@ static void __meminit setup_per_zone_inactive_ratio(void)
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
- * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
@@ -5790,21 +5350,14 @@ static void __meminit setup_per_zone_inactive_ratio(void)
int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
- int new_min_free_kbytes;
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
- new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
-
- if (new_min_free_kbytes > user_min_free_kbytes) {
- min_free_kbytes = new_min_free_kbytes;
- if (min_free_kbytes < 128)
- min_free_kbytes = 128;
- if (min_free_kbytes > 65536)
- min_free_kbytes = 65536;
- } else {
- pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
- new_min_free_kbytes, user_min_free_kbytes);
- }
+
+ min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+ if (min_free_kbytes < 128)
+ min_free_kbytes = 128;
+ if (min_free_kbytes > 65536)
+ min_free_kbytes = 65536;
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
@@ -5814,28 +5367,21 @@ int __meminit init_per_zone_wmark_min(void)
module_init(init_per_zone_wmark_min)
/*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- int rc;
-
- rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (rc)
- return rc;
-
- if (write) {
- user_min_free_kbytes = min_free_kbytes;
+ proc_dointvec(table, write, buffer, length, ppos);
+ if (write)
setup_per_zone_wmarks();
- }
return 0;
}
#ifdef CONFIG_NUMA
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -5846,12 +5392,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
return rc;
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->min_unmapped_pages = (zone->present_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -5862,7 +5408,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
return rc;
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->min_slab_pages = (zone->present_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
@@ -5877,7 +5423,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -5887,45 +5433,29 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu
- * pagelist can have before it gets flushed back to buddy allocator.
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- int old_percpu_pagelist_fraction;
+ unsigned int cpu;
int ret;
- mutex_lock(&pcp_batch_high_lock);
- old_percpu_pagelist_fraction = percpu_pagelist_fraction;
-
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || ret < 0)
- goto out;
-
- /* Sanity checking to avoid pcp imbalance */
- if (percpu_pagelist_fraction &&
- percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
- percpu_pagelist_fraction = old_percpu_pagelist_fraction;
- ret = -EINVAL;
- goto out;
- }
-
- /* No change? */
- if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
- goto out;
-
+ if (!write || (ret < 0))
+ return ret;
for_each_populated_zone(zone) {
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
+ for_each_possible_cpu(cpu) {
+ unsigned long high;
+ high = zone->present_pages / percpu_pagelist_fraction;
+ setup_pagelist_highmark(
+ per_cpu_ptr(zone->pageset, cpu), high);
+ }
}
-out:
- mutex_unlock(&pcp_batch_high_lock);
- return ret;
+ return 0;
}
int hashdist = HASHDIST_DEFAULT;
@@ -5965,10 +5495,9 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
numentries = nr_kernel_pages;
-
- /* It isn't necessary when PAGE_SIZE >= 1MB */
- if (PAGE_SHIFT < 20)
- numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+ numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
+ numentries >>= 20 - PAGE_SHIFT;
+ numentries <<= 20 - PAGE_SHIFT;
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
@@ -6006,7 +5535,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = memblock_virt_alloc_nopanic(size, 0);
+ table = alloc_bootmem_nopanic(size);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
@@ -6068,71 +5597,61 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
* @end_bitidx: The last bit of interest
* returns pageblock_bits flags
*/
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
+unsigned long get_pageblock_flags_group(struct page *page,
+ int start_bitidx, int end_bitidx)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long word;
+ unsigned long pfn, bitidx;
+ unsigned long flags = 0;
+ unsigned long value = 1;
zone = page_zone(page);
+ pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
- word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+ for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
+ if (test_bit(bitidx + start_bitidx, bitmap))
+ flags |= value;
+
+ return flags;
}
/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @start_bitidx: The first bit of interest
* @end_bitidx: The last bit of interest
* @flags: The flags to set
*/
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
- unsigned long pfn,
- unsigned long end_bitidx,
- unsigned long mask)
+void set_pageblock_flags_group(struct page *page, unsigned long flags,
+ int start_bitidx, int end_bitidx)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
-
- BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ unsigned long pfn, bitidx;
+ unsigned long value = 1;
zone = page_zone(page);
+ pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
- word_bitidx = bitidx / BITS_PER_LONG;
- bitidx &= (BITS_PER_LONG-1);
-
- VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
-
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
+ VM_BUG_ON(pfn < zone->zone_start_pfn);
+ VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
- word = ACCESS_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
+ for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
+ if (flags & value)
+ __set_bit(bitidx + start_bitidx, bitmap);
+ else
+ __clear_bit(bitidx + start_bitidx, bitmap);
}
/*
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
*
- * PageLRU check without isolation or lru_lock could race so that
+ * PageLRU check wihtout isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
* expect this function should be exact.
*/
@@ -6160,17 +5679,6 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
continue;
page = pfn_to_page(check);
-
- /*
- * Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
- * handle each tail page individually in migration.
- */
- if (PageHuge(page)) {
- iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
- continue;
- }
-
/*
* We can't use page_count without pin a page
* because another CPU can free compound page.
@@ -6228,7 +5736,8 @@ bool is_pageblock_removable_nolock(struct page *page)
zone = page_zone(page);
pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
+ if (zone->zone_start_pfn > pfn ||
+ zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
return !has_unmovable_pages(zone, page, 0, true);
@@ -6284,14 +5793,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CMA);
+ ret = migrate_pages(&cc->migratepages,
+ alloc_migrate_target,
+ 0, false, MIGRATE_SYNC,
+ MR_CMA);
}
- if (ret < 0) {
- putback_movable_pages(&cc->migratepages);
- return ret;
- }
- return 0;
+
+ putback_movable_pages(&cc->migratepages);
+ return ret > 0 ? 0 : ret;
}
/**
@@ -6324,7 +5833,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .mode = MIGRATE_SYNC,
+ .sync = true,
.ignore_skip_hint = true,
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -6436,18 +5945,32 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
-/*
- * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalulated.
- */
+static int __meminit __zone_pcp_update(void *data)
+{
+ struct zone *zone = data;
+ int cpu;
+ unsigned long batch = zone_batchsize(zone), flags;
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ pcp = &pset->pcp;
+
+ local_irq_save(flags);
+ if (pcp->count > 0)
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ drain_zonestat(zone, pset);
+ setup_pageset(pset, batch);
+ local_irq_restore(flags);
+ }
+ return 0;
+}
+
void __meminit zone_pcp_update(struct zone *zone)
{
- unsigned cpu;
- mutex_lock(&pcp_batch_high_lock);
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
- mutex_unlock(&pcp_batch_high_lock);
+ stop_machine(__zone_pcp_update, zone, NULL);
}
#endif
@@ -6479,7 +6002,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- unsigned int order, i;
+ int order, i;
unsigned long pfn;
unsigned long flags;
/* find the first valid pfn */
@@ -6531,7 +6054,7 @@ bool is_free_buddy_page(struct page *page)
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
- unsigned int order;
+ int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
@@ -6617,25 +6140,12 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
- if (reason)
- pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_page_flags(page->flags & badflags);
- }
mem_cgroup_print_bad_page(page);
}
-
-void dump_page(struct page *page, const char *reason)
-{
- dump_page_badflags(page, reason, 0);
-}
-EXPORT_SYMBOL(dump_page);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3708264d283..6d757e3a872 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,9 +54,8 @@ static int __init alloc_node_page_cgroup(int nid)
table_size = sizeof(struct page_cgroup) * nr_pages;
- base = memblock_virt_alloc_try_nid_nopanic(
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
@@ -175,7 +174,7 @@ static void free_page_cgroup(void *addr)
}
}
-static void __free_page_cgroup(unsigned long pfn)
+void __free_page_cgroup(unsigned long pfn)
{
struct mem_section *ms;
struct page_cgroup *base;
@@ -188,9 +187,9 @@ static void __free_page_cgroup(unsigned long pfn)
ms->page_cgroup = NULL;
}
-static int __meminit online_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
+int __meminit online_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
{
unsigned long start, end, pfn;
int fail = 0;
@@ -223,8 +222,8 @@ static int __meminit online_page_cgroup(unsigned long start_pfn,
return -ENOMEM;
}
-static int __meminit offline_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+int __meminit offline_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
{
unsigned long start, end, pfn;
@@ -452,7 +451,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
* @ent: swap entry to be looked up.
*
- * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
diff --git a/mm/page_io.c b/mm/page_io.c
index 955db8b0d49..78eee32ee48 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,8 +20,6 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
-#include <linux/aio.h>
-#include <linux/blkdev.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -31,19 +29,20 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
- bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_iter.bi_size = PAGE_SIZE;
+ bio->bi_idx = 0;
+ bio->bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
}
-void end_swap_bio_write(struct bio *bio, int err)
+static void end_swap_bio_write(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
@@ -62,7 +61,7 @@ void end_swap_bio_write(struct bio *bio, int err)
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ (unsigned long long)bio->bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -80,55 +79,10 @@ void end_swap_bio_read(struct bio *bio, int err)
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
- goto out;
- }
-
- SetPageUptodate(page);
-
- /*
- * There is no guarantee that the page is in swap cache - the software
- * suspend code (at least) uses end_swap_bio_read() against a non-
- * swapcache page. So we must check PG_swapcache before proceeding with
- * this optimization.
- */
- if (likely(PageSwapCache(page))) {
- struct swap_info_struct *sis;
-
- sis = page_swap_info(page);
- if (sis->flags & SWP_BLKDEV) {
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- struct gendisk *disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
- unsigned long offset;
-
- entry.val = page_private(page);
- offset = swp_offset(entry);
-
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
- }
+ (unsigned long long)bio->bi_sector);
+ } else {
+ SetPageUptodate(page);
}
-
-out:
unlock_page(page);
bio_put(bio);
}
@@ -231,7 +185,9 @@ bad_bmap:
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret = 0;
+ struct bio *bio;
+ int ret = 0, rw = WRITE;
+ struct swap_info_struct *sis = page_swap_info(page);
if (try_to_free_swap(page)) {
unlock_page(page);
@@ -243,80 +199,34 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
goto out;
}
- ret = __swap_writepage(page, wbc, end_swap_bio_write);
-out:
- return ret;
-}
-
-static sector_t swap_page_sector(struct page *page)
-{
- return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
-}
-
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- void (*end_write_func)(struct bio *, int))
-{
- struct bio *bio;
- int ret, rw = WRITE;
- struct swap_info_struct *sis = page_swap_info(page);
if (sis->flags & SWP_FILE) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from = {
- .type = ITER_BVEC | WRITE,
- .count = PAGE_SIZE,
- .iov_offset = 0,
- .nr_segs = 1,
+ struct iovec iov = {
+ .iov_base = kmap(page),
+ .iov_len = PAGE_SIZE,
};
- from.bvec = &bv; /* older gcc versions are broken */
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
+ kiocb.ki_left = PAGE_SIZE;
kiocb.ki_nbytes = PAGE_SIZE;
- set_page_writeback(page);
unlock_page(page);
- ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
- &kiocb, &from,
- kiocb.ki_pos);
+ ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
+ &kiocb, &iov,
+ kiocb.ki_pos, 1);
+ kunmap(page);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * rotate_reclaimable_page but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
- set_page_dirty(page);
- ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
- page_file_offset(page));
}
- end_page_writeback(page);
return ret;
}
- ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
- if (!ret) {
- count_vm_event(PSWPOUT);
- return 0;
- }
-
- ret = 0;
- bio = get_swap_bio(GFP_NOIO, page, end_write_func);
+ bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -339,8 +249,8 @@ int swap_readpage(struct page *page)
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageUptodate(page), page);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageUptodate(page));
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -357,13 +267,6 @@ int swap_readpage(struct page *page)
return ret;
}
- ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
- if (!ret) {
- count_vm_event(PSWPIN);
- return 0;
- }
-
- ret = 0;
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index d1473b2e948..383bdbb98b0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -6,7 +6,6 @@
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
#include <linux/memory.h>
-#include <linux/hugetlb.h>
#include "internal.h"
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
@@ -227,9 +226,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
- * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
- * are not aligned to pageblock_nr_pages.
- * Then we just check migratetype first.
+ * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
+ * is not aligned to pageblock_nr_pages.
+ * Then we just check pagetype fist.
*/
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
@@ -239,7 +238,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
if ((pfn < end_pfn) || !page)
return -EBUSY;
- /* Check all pages are free or marked as ISOLATED */
+ /* Check all pages are free or Marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
@@ -253,19 +252,6 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
{
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
- /*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
- */
- if (PageHuge(page)) {
- nodemask_t src = nodemask_of_node(page_to_nid(page));
- nodemask_t dst;
- nodes_complement(dst, src);
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node(page_to_nid(page), dst));
- }
-
if (PageHighMem(page))
gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c..35aa294656c 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -127,7 +127,28 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma;
+
+ /* We don't need vma lookup at all. */
+ if (!walk->hugetlb_entry)
+ return NULL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ vma = find_vma(walk->mm, addr);
+ if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
+ return vma;
+
+ return NULL;
+}
+
#else /* CONFIG_HUGETLB_PAGE */
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ return NULL;
+}
+
static int walk_hugetlb_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -177,53 +198,30 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (!walk->mm)
return -EINVAL;
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-
pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma;
next = pgd_addr_end(addr, end);
/*
- * This function was not intended to be vma based.
- * But there are vma special cases to be handled:
- * - hugetlb vma's
- * - VM_PFNMAP vma's
+ * handle hugetlb vma individually because pagetable walk for
+ * the hugetlb page is dependent on the architecture and
+ * we can't handled it in the same manner as non-huge pages.
*/
- vma = find_vma(walk->mm, addr);
+ vma = hugetlb_vma(addr, walk);
if (vma) {
- /*
- * There are no page structures backing a VM_PFNMAP
- * range, so do not allow split_huge_page_pmd().
- */
- if ((vma->vm_start <= addr) &&
- (vma->vm_flags & VM_PFNMAP)) {
+ if (vma->vm_end < next)
next = vma->vm_end;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
/*
- * Handle hugetlb vma individually because pagetable
- * walk for the hugetlb page is dependent on the
- * architecture and we can't handled it in the same
- * manner as non-huge pages.
+ * Hugepage is very tightly coupled with vma, so
+ * walk through hugetlb entries within a given vma.
*/
- if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
- is_vm_hugetlb_page(vma)) {
- if (vma->vm_end < next)
- next = vma->vm_end;
- /*
- * Hugepage is very tightly coupled with vma,
- * so walk through hugetlb entries within a
- * given vma.
- */
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
+ err = walk_hugetlb_range(vma, addr, next, walk);
+ if (err)
+ break;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
}
if (pgd_none_or_clear_bad(pgd)) {
@@ -242,7 +240,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (err)
break;
pgd++;
- } while (addr = next, addr < end);
+ } while (addr = next, addr != end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 2ddf9a990db..8c8e08f3a69 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -102,11 +102,10 @@ struct pcpu_chunk {
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used before the sentry */
+ int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
void *data; /* chunk data */
- int first_free; /* no free below this */
bool immutable; /* no [de]population allowed */
unsigned long populated[]; /* populated bitmap */
};
@@ -357,11 +356,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
{
int new_alloc;
- if (chunk->map_alloc >= chunk->map_used + 3)
+ if (chunk->map_alloc >= chunk->map_used + 2)
return 0;
new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + 3)
+ while (new_alloc < chunk->map_used + 2)
new_alloc *= 2;
return new_alloc;
@@ -419,6 +418,48 @@ out_unlock:
}
/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size in bytes (can be 0)
+ * @tail: tail size in bytes (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks. If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * @chunk->map must have enough free slots to accommodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
+ int head, int tail)
+{
+ int nr_extra = !!head + !!tail;
+
+ BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
+
+ /* insert new subblocks */
+ memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+ sizeof(chunk->map[0]) * (chunk->map_used - i));
+ chunk->map_used += nr_extra;
+
+ if (head) {
+ chunk->map[i + 1] = chunk->map[i] - head;
+ chunk->map[i++] = head;
+ }
+ if (tail) {
+ chunk->map[i++] -= tail;
+ chunk->map[i] = tail;
+ }
+}
+
+/**
* pcpu_alloc_area - allocate area from a pcpu_chunk
* @chunk: chunk of interest
* @size: wanted size in bytes
@@ -442,27 +483,19 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
int i, off;
- bool seen_free = false;
- int *p;
- for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
+ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+ bool is_last = i + 1 == chunk->map_used;
int head, tail;
- int this_size;
-
- off = *p;
- if (off & 1)
- continue;
/* extra for alignment requirement */
head = ALIGN(off, align) - off;
+ BUG_ON(i == 0 && head != 0);
- this_size = (p[1] & ~1) - off;
- if (this_size < head + size) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- max_contig = max(this_size, max_contig);
+ if (chunk->map[i] < 0)
+ continue;
+ if (chunk->map[i] < head + size) {
+ max_contig = max(chunk->map[i], max_contig);
continue;
}
@@ -472,59 +505,44 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
* than sizeof(int), which is very small but isn't too
* uncommon for percpu allocations.
*/
- if (head && (head < sizeof(int) || !(p[-1] & 1))) {
- *p = off += head;
- if (p[-1] & 1)
+ if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+ if (chunk->map[i - 1] > 0)
+ chunk->map[i - 1] += head;
+ else {
+ chunk->map[i - 1] -= head;
chunk->free_size -= head;
- else
- max_contig = max(*p - p[-1], max_contig);
- this_size -= head;
+ }
+ chunk->map[i] -= head;
+ off += head;
head = 0;
}
/* if tail is small, just keep it around */
- tail = this_size - head - size;
- if (tail < sizeof(int)) {
+ tail = chunk->map[i] - head - size;
+ if (tail < sizeof(int))
tail = 0;
- size = this_size - head;
- }
/* split if warranted */
if (head || tail) {
- int nr_extra = !!head + !!tail;
-
- /* insert new subblocks */
- memmove(p + nr_extra + 1, p + 1,
- sizeof(chunk->map[0]) * (chunk->map_used - i));
- chunk->map_used += nr_extra;
-
+ pcpu_split_block(chunk, i, head, tail);
if (head) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- *++p = off += head;
- ++i;
- max_contig = max(head, max_contig);
- }
- if (tail) {
- p[1] = off + size;
- max_contig = max(tail, max_contig);
+ i++;
+ off += head;
+ max_contig = max(chunk->map[i - 1], max_contig);
}
+ if (tail)
+ max_contig = max(chunk->map[i + 1], max_contig);
}
- if (!seen_free)
- chunk->first_free = i + 1;
-
/* update hint and mark allocated */
- if (i + 1 == chunk->map_used)
+ if (is_last)
chunk->contig_hint = max_contig; /* fully scanned */
else
chunk->contig_hint = max(chunk->contig_hint,
max_contig);
- chunk->free_size -= size;
- *p |= 1;
+ chunk->free_size -= chunk->map[i];
+ chunk->map[i] = -chunk->map[i];
pcpu_chunk_relocate(chunk, oslot);
return off;
@@ -552,50 +570,34 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
int oslot = pcpu_chunk_slot(chunk);
- int off = 0;
- unsigned i, j;
- int to_free = 0;
- int *p;
-
- freeme |= 1; /* we are searching for <given offset, in use> pair */
-
- i = 0;
- j = chunk->map_used;
- while (i != j) {
- unsigned k = (i + j) / 2;
- off = chunk->map[k];
- if (off < freeme)
- i = k + 1;
- else if (off > freeme)
- j = k;
- else
- i = j = k;
- }
- BUG_ON(off != freeme);
+ int i, off;
- if (i < chunk->first_free)
- chunk->first_free = i;
+ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+ if (off == freeme)
+ break;
+ BUG_ON(off != freeme);
+ BUG_ON(chunk->map[i] > 0);
- p = chunk->map + i;
- *p = off &= ~1;
- chunk->free_size += (p[1] & ~1) - off;
+ chunk->map[i] = -chunk->map[i];
+ chunk->free_size += chunk->map[i];
- /* merge with next? */
- if (!(p[1] & 1))
- to_free++;
/* merge with previous? */
- if (i > 0 && !(p[-1] & 1)) {
- to_free++;
+ if (i > 0 && chunk->map[i - 1] >= 0) {
+ chunk->map[i - 1] += chunk->map[i];
+ chunk->map_used--;
+ memmove(&chunk->map[i], &chunk->map[i + 1],
+ (chunk->map_used - i) * sizeof(chunk->map[0]));
i--;
- p--;
}
- if (to_free) {
- chunk->map_used -= to_free;
- memmove(p + 1, p + 1 + to_free,
- (chunk->map_used - i) * sizeof(chunk->map[0]));
+ /* merge with next? */
+ if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+ chunk->map[i] += chunk->map[i + 1];
+ chunk->map_used--;
+ memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+ (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
}
- chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
+ chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
pcpu_chunk_relocate(chunk, oslot);
}
@@ -610,14 +612,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ kfree(chunk);
return NULL;
}
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[0] = 0;
- chunk->map[1] = pcpu_unit_size | 1;
- chunk->map_used = 1;
+ chunk->map[chunk->map_used++] = pcpu_unit_size;
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
@@ -713,16 +713,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
unsigned long flags;
void __percpu *ptr;
- /*
- * We want the lowest bit of offset available for in-use/free
- * indicator, so force >= 16bit alignment and make size even.
- */
- if (unlikely(align < 2))
- align = 2;
-
- if (unlikely(size & 1))
- size++;
-
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
@@ -1073,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
+ ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
if (!ptr)
return NULL;
ai = ptr;
@@ -1098,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- memblock_free_early(__pa(ai), ai->__ai_size);
+ free_bootmem(__pa(ai), ai->__ai_size);
}
/**
@@ -1256,12 +1246,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = memblock_virt_alloc(ai->nr_groups *
- sizeof(group_offsets[0]), 0);
- group_sizes = memblock_virt_alloc(ai->nr_groups *
- sizeof(group_sizes[0]), 0);
- unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
- unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
+ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
+ group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
+ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
+ unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1323,8 +1311,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = memblock_virt_alloc(
- pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
+ pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -1335,7 +1322,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
- schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+ schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->base_addr = base_addr;
schunk->map = smap;
@@ -1353,17 +1340,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
}
schunk->contig_hint = schunk->free_size;
- schunk->map[0] = 1;
- schunk->map[1] = ai->static_size;
- schunk->map_used = 1;
+ schunk->map[schunk->map_used++] = -ai->static_size;
if (schunk->free_size)
- schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
- else
- schunk->map[1] |= 1;
+ schunk->map[schunk->map_used++] = schunk->free_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+ dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
@@ -1372,10 +1355,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
- dchunk->map[0] = 1;
- dchunk->map[1] = pcpu_reserved_chunk_limit;
- dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
- dchunk->map_used = 2;
+ dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
+ dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* link the first chunk in */
@@ -1645,7 +1626,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = memblock_virt_alloc_nopanic(areas_size, 0);
+ areas = alloc_bootmem_nopanic(areas_size);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -1705,10 +1686,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
max_distance += ai->unit_size;
/* warn if maximum distance is further than 75% of vmalloc space */
- if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+ if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
"space 0x%lx\n", max_distance,
- VMALLOC_TOTAL);
+ (unsigned long)(VMALLOC_END - VMALLOC_START));
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -1725,13 +1706,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
- if (areas[group])
- free_fn(areas[group],
- ai->groups[group].nr_units * ai->unit_size);
+ free_fn(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- memblock_free_early(__pa(areas), areas_size);
+ free_bootmem(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1779,7 +1759,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = memblock_virt_alloc(pages_size, 0);
+ pages = alloc_bootmem(pages_size);
/* allocate pages */
j = 0;
@@ -1842,7 +1822,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- memblock_free_early(__pa(pages), pages_size);
+ free_bootmem(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1867,13 +1847,12 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return memblock_virt_alloc_from_nopanic(
- size, align, __pa(MAX_DMA_ADDRESS));
+ return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- memblock_free_early(__pa(ptr), size);
+ free_bootmem(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
@@ -1916,9 +1895,7 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = memblock_virt_alloc_from_nopanic(unit_size,
- PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
+ fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a8b91992593..0c8323fe6c8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,30 +10,6 @@
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
-/*
- * If a p?d_bad entry is found while walking page tables, report
- * the error, before resetting entry to p?d_none. Usually (but
- * very seldom) called out from the p?d_none_or_clear_bad macros.
- */
-
-void pgd_clear_bad(pgd_t *pgd)
-{
- pgd_ERROR(*pgd);
- pgd_clear(pgd);
-}
-
-void pud_clear_bad(pud_t *pud)
-{
- pud_ERROR(*pud);
- pud_clear(pud);
-}
-
-void pmd_clear_bad(pmd_t *pmd)
-{
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
-}
-
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
* Only sets the access flags (dirty, accessed), as well as write
@@ -110,10 +86,9 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
- struct mm_struct *mm = (vma)->vm_mm;
pte_t pte;
- pte = ptep_get_and_clear(mm, address, ptep);
- if (pte_accessible(mm, pte))
+ pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+ if (pte_accessible(pte))
flush_tlb_page(vma, address);
return pte;
}
@@ -149,17 +124,16 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
- pgtable_t pgtable)
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(&mm->page_table_lock);
/* FIFO */
- if (!pmd_huge_pte(mm, pmdp))
+ if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
else
- list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
- pmd_huge_pte(mm, pmdp) = pgtable;
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -167,18 +141,18 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* no "address" argument so destroys page coloring of some arch */
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
{
pgtable_t pgtable;
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(&mm->page_table_lock);
/* FIFO */
- pgtable = pmd_huge_pte(mm, pmdp);
+ pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
- pmd_huge_pte(mm, pmdp) = NULL;
+ mm->pmd_huge_pte = NULL;
else {
- pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
struct page, lru);
list_del(&pgtable->lru);
}
@@ -192,9 +166,6 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t entry = *pmdp;
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afcd9e1..926b4664974 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -23,40 +23,129 @@
/**
* process_vm_rw_pages - read/write pages from task specified
- * @pages: array of pointers to pages we want to copy
+ * @task: task to read/write from
+ * @mm: mm for task
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @pa: address of page in task to start copying from/to
* @start_offset: offset in page to start copying from/to
* @len: number of bytes to copy
- * @iter: where to copy to/from locally
+ * @lvec: iovec array specifying where to copy to/from
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
* @vm_write: 0 means copy from, 1 means copy to
+ * @nr_pages_to_copy: number of pages to copy
+ * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success, error code otherwise
*/
-static int process_vm_rw_pages(struct page **pages,
- unsigned offset,
- size_t len,
- struct iov_iter *iter,
- int vm_write)
+static int process_vm_rw_pages(struct task_struct *task,
+ struct mm_struct *mm,
+ struct page **process_pages,
+ unsigned long pa,
+ unsigned long start_offset,
+ unsigned long len,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
+ int vm_write,
+ unsigned int nr_pages_to_copy,
+ ssize_t *bytes_copied)
{
- /* Do the copy for each page */
- while (len && iov_iter_count(iter)) {
- struct page *page = *pages++;
- size_t copy = PAGE_SIZE - offset;
- size_t copied;
+ int pages_pinned;
+ void *target_kaddr;
+ int pgs_copied = 0;
+ int j;
+ int ret;
+ ssize_t bytes_to_copy;
+ ssize_t rc = 0;
- if (copy > len)
- copy = len;
+ *bytes_copied = 0;
- if (vm_write) {
- copied = copy_page_from_iter(page, offset, copy, iter);
- set_page_dirty_lock(page);
+ /* Get the pages we're interested in */
+ down_read(&mm->mmap_sem);
+ pages_pinned = get_user_pages(task, mm, pa,
+ nr_pages_to_copy,
+ vm_write, 0, process_pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ if (pages_pinned != nr_pages_to_copy) {
+ rc = -EFAULT;
+ goto end;
+ }
+
+ /* Do the copy for each page */
+ for (pgs_copied = 0;
+ (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
+ pgs_copied++) {
+ /* Make sure we have a non zero length iovec */
+ while (*lvec_current < lvec_cnt
+ && lvec[*lvec_current].iov_len == 0)
+ (*lvec_current)++;
+ if (*lvec_current == lvec_cnt)
+ break;
+
+ /*
+ * Will copy smallest of:
+ * - bytes remaining in page
+ * - bytes remaining in destination iovec
+ */
+ bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
+ len - *bytes_copied);
+ bytes_to_copy = min_t(ssize_t, bytes_to_copy,
+ lvec[*lvec_current].iov_len
+ - *lvec_offset);
+
+ target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
+
+ if (vm_write)
+ ret = copy_from_user(target_kaddr,
+ lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ bytes_to_copy);
+ else
+ ret = copy_to_user(lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ target_kaddr, bytes_to_copy);
+ kunmap(process_pages[pgs_copied]);
+ if (ret) {
+ *bytes_copied += bytes_to_copy - ret;
+ pgs_copied++;
+ rc = -EFAULT;
+ goto end;
+ }
+ *bytes_copied += bytes_to_copy;
+ *lvec_offset += bytes_to_copy;
+ if (*lvec_offset == lvec[*lvec_current].iov_len) {
+ /*
+ * Need to copy remaining part of page into the
+ * next iovec if there are any bytes left in page
+ */
+ (*lvec_current)++;
+ *lvec_offset = 0;
+ start_offset = (start_offset + bytes_to_copy)
+ % PAGE_SIZE;
+ if (start_offset)
+ pgs_copied--;
} else {
- copied = copy_page_to_iter(page, offset, copy, iter);
+ start_offset = 0;
+ }
+ }
+
+end:
+ if (vm_write) {
+ for (j = 0; j < pages_pinned; j++) {
+ if (j < pgs_copied)
+ set_page_dirty_lock(process_pages[j]);
+ put_page(process_pages[j]);
}
- len -= copied;
- if (copied < copy && iov_iter_count(iter))
- return -EFAULT;
- offset = 0;
+ } else {
+ for (j = 0; j < pages_pinned; j++)
+ put_page(process_pages[j]);
}
- return 0;
+
+ return rc;
}
/* Maximum number of pages kmalloc'd to hold struct page's during copy */
@@ -66,60 +155,67 @@ static int process_vm_rw_pages(struct page **pages,
* process_vm_rw_single_vec - read/write pages from task specified
* @addr: start memory address of target process
* @len: size of area to copy to/from
- * @iter: where to copy to/from locally
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
* @process_pages: struct pages area that can store at least
* nr_pages_to_copy struct page pointers
* @mm: mm for task
* @task: task to read/write from
* @vm_write: 0 means copy from, 1 means copy to
+ * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success or on failure error code
*/
static int process_vm_rw_single_vec(unsigned long addr,
unsigned long len,
- struct iov_iter *iter,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
struct page **process_pages,
struct mm_struct *mm,
struct task_struct *task,
- int vm_write)
+ int vm_write,
+ ssize_t *bytes_copied)
{
unsigned long pa = addr & PAGE_MASK;
unsigned long start_offset = addr - pa;
unsigned long nr_pages;
+ ssize_t bytes_copied_loop;
ssize_t rc = 0;
+ unsigned long nr_pages_copied = 0;
+ unsigned long nr_pages_to_copy;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
+ *bytes_copied = 0;
+
/* Work out address and page range required */
if (len == 0)
return 0;
nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
- while (!rc && nr_pages && iov_iter_count(iter)) {
- int pages = min(nr_pages, max_pages_per_loop);
- size_t bytes;
-
- /* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages = get_user_pages(task, mm, pa, pages,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
- if (pages <= 0)
- return -EFAULT;
-
- bytes = pages * PAGE_SIZE - start_offset;
- if (bytes > len)
- bytes = len;
+ while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
+ nr_pages_to_copy = min(nr_pages - nr_pages_copied,
+ max_pages_per_loop);
- rc = process_vm_rw_pages(process_pages,
- start_offset, bytes, iter,
- vm_write);
- len -= bytes;
+ rc = process_vm_rw_pages(task, mm, process_pages, pa,
+ start_offset, len,
+ lvec, lvec_cnt,
+ lvec_current, lvec_offset,
+ vm_write, nr_pages_to_copy,
+ &bytes_copied_loop);
start_offset = 0;
- nr_pages -= pages;
- pa += pages * PAGE_SIZE;
- while (pages)
- put_page(process_pages[--pages]);
+ *bytes_copied += bytes_copied_loop;
+
+ if (rc < 0) {
+ return rc;
+ } else {
+ len -= bytes_copied_loop;
+ nr_pages_copied += nr_pages_to_copy;
+ pa += nr_pages_to_copy * PAGE_SIZE;
+ }
}
return rc;
@@ -132,7 +228,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
/**
* process_vm_rw_core - core of reading/writing pages from task specified
* @pid: PID of process to read/write from/to
- * @iter: where to copy to/from locally
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
* @rvec: iovec array specifying where to copy to/from in the other process
* @riovcnt: size of rvec array
* @flags: currently unused
@@ -141,7 +238,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
* return less bytes than expected if an error occurs during the copying
* process.
*/
-static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
+static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
+ unsigned long liovcnt,
const struct iovec *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
@@ -152,10 +250,13 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
struct mm_struct *mm;
unsigned long i;
ssize_t rc = 0;
+ ssize_t bytes_copied_loop;
+ ssize_t bytes_copied = 0;
unsigned long nr_pages = 0;
unsigned long nr_pages_iov;
+ unsigned long iov_l_curr_idx = 0;
+ size_t iov_l_curr_offset = 0;
ssize_t iov_len;
- size_t total_len = iov_iter_count(iter);
/*
* Work out how many pages of struct pages we're going to need
@@ -209,20 +310,24 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
goto put_task_struct;
}
- for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
+ for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
- iter, process_pages, mm, task, vm_write);
-
- /* copied = space before - space after */
- total_len -= iov_iter_count(iter);
-
- /* If we have managed to copy any data at all then
- we return the number of bytes copied. Otherwise
- we return the error code */
- if (total_len)
- rc = total_len;
+ lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
+ process_pages, mm, task, vm_write, &bytes_copied_loop);
+ bytes_copied += bytes_copied_loop;
+ if (rc != 0) {
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (bytes_copied)
+ rc = bytes_copied;
+ goto put_mm;
+ }
+ }
+ rc = bytes_copied;
+put_mm:
mmput(mm);
put_task_struct:
@@ -258,7 +363,6 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
- struct iov_iter iter;
ssize_t rc;
if (flags != 0)
@@ -274,14 +378,13 @@ static ssize_t process_vm_rw(pid_t pid,
if (rc <= 0)
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
-
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
free_iovecs:
if (iov_r != iovstack_r)
@@ -309,7 +412,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
#ifdef CONFIG_COMPAT
-static ssize_t
+asmlinkage ssize_t
compat_process_vm_rw(compat_pid_t pid,
const struct compat_iovec __user *lvec,
unsigned long liovcnt,
@@ -321,12 +424,17 @@ compat_process_vm_rw(compat_pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
- struct iov_iter iter;
ssize_t rc = -EFAULT;
if (flags != 0)
return -EINVAL;
+ if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
+ goto out;
+
+ if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
+ goto out;
+
if (vm_write)
rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
@@ -337,40 +445,44 @@ compat_process_vm_rw(compat_pid_t pid,
&iov_l);
if (rc <= 0)
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
&iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
if (iov_l != iovstack_l)
kfree(iov_l);
+
+out:
return rc;
}
-COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
+asmlinkage ssize_t
+compat_sys_process_vm_readv(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
{
return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
riovcnt, flags, 0);
}
-COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
+asmlinkage ssize_t
+compat_sys_process_vm_writev(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
{
return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
riovcnt, flags, 1);
diff --git a/mm/readahead.c b/mm/readahead.c
index 0ca36a7770b..7963f239123 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,7 +8,9 @@
*/
#include <linux/kernel.h>
+#include <linux/fs.h>
#include <linux/gfp.h>
+#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -18,8 +20,6 @@
#include <linux/syscalls.h>
#include <linux/file.h>
-#include "internal.h"
-
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
if (!trylock_page(page))
BUG();
page->mapping = mapping;
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0);
page->mapping = NULL;
unlock_page(page);
}
@@ -149,7 +149,8 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -178,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page))
+ if (page)
continue;
page = page_cache_alloc_readahead(mapping);
@@ -210,6 +211,8 @@ out:
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
+ int ret = 0;
+
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
@@ -223,23 +226,39 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk, 0);
- if (err < 0)
- return err;
-
+ if (err < 0) {
+ ret = err;
+ break;
+ }
+ ret += err;
offset += this_chunk;
nr_to_read -= this_chunk;
}
- return 0;
+ return ret;
}
-#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
*/
unsigned long max_sane_readahead(unsigned long nr)
{
- return min(nr, MAX_READAHEAD);
+ return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
+ + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
+}
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ int actual;
+
+ actual = __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+
+ return actual;
}
/*
@@ -332,7 +351,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_hole(mapping, offset - 1, max);
+ head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -352,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping,
size = count_history_pages(mapping, ra, offset, max);
/*
- * not enough history pages:
+ * no history pages:
* it could be a random read
*/
- if (size <= req_size)
+ if (!size)
return 0;
/*
@@ -366,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping,
size *= 2;
ra->start = offset;
- ra->size = min(size + req_size, max);
- ra->async_size = 1;
+ ra->size = get_init_ra_size(size + req_size, max);
+ ra->async_size = ra->size;
return 1;
}
@@ -382,7 +401,6 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
- pgoff_t prev_offset;
/*
* start of file
@@ -412,7 +430,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max);
+ start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -434,11 +452,8 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
- * trivial case: (offset - prev_offset) == 1
- * unaligned reads: (offset - prev_offset) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
- if (offset - prev_offset <= 1UL)
+ if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
goto initial_readahead;
/*
@@ -554,13 +569,14 @@ static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
pgoff_t index, unsigned long nr)
{
- if (!mapping || !mapping->a_ops)
+ if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
- return force_page_cache_readahead(mapping, filp, index, nr);
+ force_page_cache_readahead(mapping, filp, index, nr);
+ return 0;
}
-SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
ssize_t ret;
struct fd f;
@@ -579,3 +595,10 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
}
return ret;
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+ return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index b7e94ebbd09..2c78f8cadc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,10 +103,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
- might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
}
kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -192,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
if (unlikely(allocated))
put_anon_vma(allocated);
@@ -309,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma_unlock_write(anon_vma);
+ anon_vma_unlock(anon_vma);
return 0;
@@ -427,9 +426,8 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* above cannot corrupt).
*/
if (!page_mapped(page)) {
- rcu_read_unlock();
put_anon_vma(anon_vma);
- return NULL;
+ anon_vma = NULL;
}
out:
rcu_read_unlock();
@@ -479,9 +477,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
if (!page_mapped(page)) {
- rcu_read_unlock();
put_anon_vma(anon_vma);
- return NULL;
+ anon_vma = NULL;
+ goto out;
}
/* we pinned the anon_vma, its safe to sleep */
@@ -569,7 +567,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -580,13 +577,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- /*
- * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
- */
- pmde = ACCESS_ONCE(*pmd);
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ if (!pmd_present(*pmd))
pmd = NULL;
out:
return pmd;
@@ -609,12 +600,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
- /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
- if (!pte)
- return NULL;
-
- ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ ptl = &mm->page_table_lock;
goto check;
}
@@ -622,6 +609,9 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
if (!pmd)
return NULL;
+ if (pmd_trans_huge(*pmd))
+ return NULL;
+
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
if (!sync && !pte_present(*pte)) {
@@ -666,47 +656,46 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
-struct page_referenced_arg {
- int mapcount;
- int referenced;
- unsigned long vm_flags;
- struct mem_cgroup *memcg;
-};
/*
- * arg: page_referenced_arg will be passed
+ * Subfunctions of page_referenced: page_referenced_one called
+ * repeatedly from either page_referenced_anon or page_referenced_file.
*/
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *mapcount,
+ unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
int referenced = 0;
- struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
+ spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
- return SWAP_AGAIN;
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (!pmd) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ spin_unlock(&mm->page_table_lock);
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
}
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
} else {
pte_t *pte;
+ spinlock_t *ptl;
/*
* rmap might return false positives; we must filter
@@ -714,12 +703,13 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
*/
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
- return SWAP_AGAIN;
+ goto out;
if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
}
if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -730,33 +720,119 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
* mapping is already gone, the unmap path will have
* set PG_referenced or activated the page.
*/
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (likely(!VM_SequentialReadHint(vma)))
referenced++;
}
pte_unmap_unlock(pte, ptl);
}
- if (referenced) {
- pra->referenced++;
- pra->vm_flags |= vma->vm_flags;
- }
+ (*mapcount)--;
- pra->mapcount--;
- if (!pra->mapcount)
- return SWAP_SUCCESS; /* To break the loop */
+ if (referenced)
+ *vm_flags |= vma->vm_flags;
+out:
+ return referenced;
+}
- return SWAP_AGAIN;
+static int page_referenced_anon(struct page *page,
+ struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
+{
+ unsigned int mapcount;
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff;
+ struct anon_vma_chain *avc;
+ int referenced = 0;
+
+ anon_vma = page_lock_anon_vma_read(page);
+ if (!anon_vma)
+ return referenced;
+
+ mapcount = page_mapcount(page);
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long address = vma_address(page, vma);
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+ continue;
+ referenced += page_referenced_one(page, vma, address,
+ &mapcount, vm_flags);
+ if (!mapcount)
+ break;
+ }
+
+ page_unlock_anon_vma_read(anon_vma);
+ return referenced;
}
-static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+/**
+ * page_referenced_file - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ * @memcg: target memory control group
+ * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag. This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds. It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ */
+static int page_referenced_file(struct page *page,
+ struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
{
- struct page_referenced_arg *pra = arg;
- struct mem_cgroup *memcg = pra->memcg;
+ unsigned int mapcount;
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ int referenced = 0;
- if (!mm_match_cgroup(vma->vm_mm, memcg))
- return true;
+ /*
+ * The caller's checks on page->mapping and !PageAnon have made
+ * sure that this is a file page: the check for page->mapping
+ * excludes the case just before it gets set on an anon page.
+ */
+ BUG_ON(PageAnon(page));
- return false;
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_mutex.
+ */
+ BUG_ON(!PageLocked(page));
+
+ mutex_lock(&mapping->i_mmap_mutex);
+
+ /*
+ * i_mmap_mutex does not stabilize mapcount at all, but mapcount
+ * is more likely to be accurate if we note it after spinning.
+ */
+ mapcount = page_mapcount(page);
+
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ unsigned long address = vma_address(page, vma);
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+ continue;
+ referenced += page_referenced_one(page, vma, address,
+ &mapcount, vm_flags);
+ if (!mapcount)
+ break;
+ }
+
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return referenced;
}
/**
@@ -774,57 +850,44 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int ret;
+ int referenced = 0;
int we_locked = 0;
- struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
- .memcg = memcg,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_referenced_one,
- .arg = (void *)&pra,
- .anon_lock = page_lock_anon_vma_read,
- };
*vm_flags = 0;
- if (!page_mapped(page))
- return 0;
-
- if (!page_rmapping(page))
- return 0;
-
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
- if (!we_locked)
- return 1;
- }
-
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_page_referenced_vma;
+ if (page_mapped(page) && page_rmapping(page)) {
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked) {
+ referenced++;
+ goto out;
+ }
+ }
+ if (unlikely(PageKsm(page)))
+ referenced += page_referenced_ksm(page, memcg,
+ vm_flags);
+ else if (PageAnon(page))
+ referenced += page_referenced_anon(page, memcg,
+ vm_flags);
+ else if (page->mapping)
+ referenced += page_referenced_file(page, memcg,
+ vm_flags);
+ if (we_locked)
+ unlock_page(page);
+
+ if (page_test_and_clear_young(page_to_pfn(page)))
+ referenced++;
}
-
- ret = rmap_walk(page, &rwc);
- *vm_flags = pra.vm_flags;
-
- if (we_locked)
- unlock_page(page);
-
- return pra.referenced;
+out:
+ return referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+ unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
- int *cleaned = arg;
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
@@ -843,44 +906,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
- if (ret) {
+ if (ret)
mmu_notifier_invalidate_page(mm, address);
- (*cleaned)++;
- }
out:
- return SWAP_AGAIN;
+ return ret;
}
-static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
+static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
- if (vma->vm_flags & VM_SHARED)
- return false;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ int ret = 0;
+
+ BUG_ON(PageAnon(page));
- return true;
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_flags & VM_SHARED) {
+ unsigned long address = vma_address(page, vma);
+ ret += page_mkclean_one(page, vma, address);
+ }
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return ret;
}
int page_mkclean(struct page *page)
{
- int cleaned = 0;
- struct address_space *mapping;
- struct rmap_walk_control rwc = {
- .arg = (void *)&cleaned,
- .rmap_one = page_mkclean_one,
- .invalid_vma = invalid_mkclean_vma,
- };
+ int ret = 0;
BUG_ON(!PageLocked(page));
- if (!page_mapped(page))
- return 0;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- rmap_walk(page, &rwc);
+ if (page_mapped(page)) {
+ struct address_space *mapping = page_mapping(page);
+ if (mapping)
+ ret = page_mkclean_file(mapping, page);
+ }
- return cleaned;
+ return ret;
}
EXPORT_SYMBOL_GPL(page_mkclean);
@@ -900,9 +963,9 @@ void page_move_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!anon_vma);
- VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
+ VM_BUG_ON(page->index != linear_page_index(vma, address));
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -992,22 +1055,16 @@ void do_page_add_anon_rmap(struct page *page,
{
int first = atomic_inc_and_test(&page->_mapcount);
if (first) {
- /*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption
- * disabled.
- */
- if (PageTransHuge(page))
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
}
if (unlikely(PageKsm(page)))
return;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
/* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
@@ -1031,30 +1088,15 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
__page_set_anon_rmap(page, vma, address, 1);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
- SetPageActive(page);
- lru_cache_add(page);
- return;
- }
-
- if (!TestSetPageMlocked(page)) {
- /*
- * We use the irq-unsafe __mod_zone_page_stat because this
- * counter is not modified from interrupt context, and the pte
- * lock is held(spinlock), which implies preemption disabled.
- */
- __mod_zone_page_state(page_zone(page), NR_MLOCK,
- hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
- }
- add_page_to_unevictable_list(page);
+ if (!mlocked_vma_newpage(vma, page))
+ lru_cache_add_lru(page, LRU_ACTIVE_ANON);
+ else
+ add_page_to_unevictable_list(page);
}
/**
@@ -1071,7 +1113,7 @@ void page_add_file_rmap(struct page *page)
mem_cgroup_begin_update_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
@@ -1084,6 +1126,7 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
bool anon = PageAnon(page);
bool locked;
unsigned long flags;
@@ -1101,26 +1144,44 @@ void page_remove_rmap(struct page *page)
goto out;
/*
+ * Now that the last pte has gone, s390 must transfer dirty
+ * flag from storage key to struct page. We can usually skip
+ * this if the page is anon, so about to be freed; but perhaps
+ * not if it's in swapcache - there might be another pte slot
+ * containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking (note that this is not only an
+ * optimization but also solves problems caused by dirty flag in
+ * storage key getting set by a write from inside kernel); but need to
+ * catch shm and tmpfs and ramfs pages which have been modified since
+ * creation by read fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
+ */
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
+ page_test_and_clear_dirty(page_to_pfn(page), 1))
+ set_page_dirty(page);
+ /*
* Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
* and not charged by memcg for now.
- *
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
- * these counters are not modified in interrupt context, and
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
*/
if (unlikely(PageHuge(page)))
goto out;
if (anon) {
mem_cgroup_uncharge_page(page);
- if (PageTransHuge(page))
+ if (!PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ else
__dec_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
if (unlikely(PageMlocked(page)))
@@ -1141,17 +1202,17 @@ out:
}
/*
- * @arg: enum ttu_flags will be passed to this argument
+ * Subfunctions of try_to_unmap: try_to_unmap_one called
+ * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, enum ttu_flags flags)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
- enum ttu_flags flags = (enum ttu_flags)arg;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1166,7 +1227,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED)
goto out_mlock;
- if (flags & TTU_MUNLOCK)
+ if (TTU_ACTION(flags) == TTU_MUNLOCK)
goto out_unmap;
}
if (!(flags & TTU_IGNORE_ACCESS)) {
@@ -1196,19 +1257,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
- } else if (pte_unused(pteval)) {
- /*
- * The guest indicated that the page content is of no
- * interest anymore. Simply discard the pte, vmscan
- * will take care of the rest.
- */
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
- pte_t swp_pte;
if (PageSwapCache(page)) {
/*
@@ -1234,16 +1284,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- BUG_ON(!(flags & TTU_MIGRATION));
+ BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
entry = make_migration_entry(page, pte_write(pteval));
}
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(mm, address, pte, swp_pte);
+ set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION)) {
+ (TTU_ACTION(flags) == TTU_MIGRATION)) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
@@ -1256,7 +1303,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+ if (ret != SWAP_FAIL)
mmu_notifier_invalidate_page(mm, address);
out:
return ret;
@@ -1363,19 +1410,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- if (page == check_page) {
- /* we know we have check_page locked */
- mlock_vma_page(page);
+ mlock_vma_page(page); /* no-op if already mlocked */
+ if (page == check_page)
ret = SWAP_MLOCK;
- } else if (trylock_page(page)) {
- /*
- * If we can lock the page, perform mlock.
- * Otherwise leave the page alone, it will be
- * eventually encountered again later.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
continue; /* don't unmap */
}
@@ -1387,12 +1424,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
- if (page->index != linear_page_index(vma, address)) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(pteval))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, address, pte, ptfile);
- }
+ if (page->index != linear_page_index(vma, address))
+ set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -1410,9 +1443,93 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-static int try_to_unmap_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+/**
+ * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
+ * rmap method
+ * @page: the page to unmap/unlock
+ * @flags: action and flags
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * anonymous pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * 'LOCKED.
+ */
+static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff;
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
+
+ anon_vma = page_lock_anon_vma_read(page);
+ if (!anon_vma)
+ return ret;
+
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long address;
+
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_vma_temporary_stack(vma))
+ continue;
+
+ address = vma_address(page, vma);
+ ret = try_to_unmap_one(page, vma, address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page))
+ break;
+ }
+
+ page_unlock_anon_vma_read(anon_vma);
+ return ret;
+}
+
+/**
+ * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
+ * @page: the page to unmap/unlock
+ * @flags: action and flags
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * object-based pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * 'LOCKED.
+ */
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+{
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
unsigned long cursor;
@@ -1420,9 +1537,27 @@ static int try_to_unmap_nonlinear(struct page *page,
unsigned long max_nl_size = 0;
unsigned int mapcount;
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ unsigned long address = vma_address(page, vma);
+ ret = try_to_unmap_one(page, vma, address, flags);
+ if (ret != SWAP_AGAIN || !page_mapped(page))
+ goto out;
+ }
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto out;
+
+ /*
+ * We don't bother to try to find the munlocked page in nonlinears.
+ * It's costly. Instead, later, page reclaim logic may call
+ * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
+ */
+ if (TTU_ACTION(flags) == TTU_MUNLOCK)
+ goto out;
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1432,7 +1567,8 @@ static int try_to_unmap_nonlinear(struct page *page,
}
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- return SWAP_FAIL;
+ ret = SWAP_FAIL;
+ goto out;
}
/*
@@ -1444,8 +1580,7 @@ static int try_to_unmap_nonlinear(struct page *page,
*/
mapcount = page_mapcount(page);
if (!mapcount)
- return ret;
-
+ goto out;
cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1453,11 +1588,10 @@ static int try_to_unmap_nonlinear(struct page *page,
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
- while (cursor < max_nl_cursor &&
+ while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
if (try_to_unmap_cluster(cursor, &mapcount,
vma, page) == SWAP_MLOCK)
@@ -1465,7 +1599,7 @@ static int try_to_unmap_nonlinear(struct page *page,
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
- return ret;
+ goto out;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
@@ -1480,34 +1614,11 @@ static int try_to_unmap_nonlinear(struct page *page,
*/
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
-
+out:
+ mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
-static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
-{
- return is_vma_temporary_stack(vma);
-}
-
-static int page_not_mapped(struct page *page)
-{
- return !page_mapped(page);
-};
-
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
@@ -1525,29 +1636,16 @@ static int page_not_mapped(struct page *page)
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
- struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)flags,
- .done = page_not_mapped,
- .file_nonlinear = try_to_unmap_nonlinear,
- .anon_lock = page_lock_anon_vma_read,
- };
- VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
-
- /*
- * During exec, a temporary VMA is setup and later moved.
- * The VMA is moved under the anon_vma lock but not the
- * page tables leading to a race where migration cannot
- * find the migration ptes. Rather than increasing the
- * locking requirements of exec(), migration skips
- * temporary VMAs until after exec() completes.
- */
- if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
- rwc.invalid_vma = invalid_migration_vma;
-
- ret = rmap_walk(page, &rwc);
+ BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
+ if (unlikely(PageKsm(page)))
+ ret = try_to_unmap_ksm(page, flags);
+ else if (PageAnon(page))
+ ret = try_to_unmap_anon(page, flags);
+ else
+ ret = try_to_unmap_file(page, flags);
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
@@ -1570,43 +1668,38 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*/
int try_to_munlock(struct page *page)
{
- int ret;
- struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
- .done = page_not_mapped,
- /*
- * We don't bother to try to find the munlocked page in
- * nonlinears. It's costly. Instead, later, page reclaim logic
- * may call try_to_unmap() and recover PG_mlocked lazily.
- */
- .file_nonlinear = NULL,
- .anon_lock = page_lock_anon_vma_read,
+ VM_BUG_ON(!PageLocked(page) || PageLRU(page));
- };
-
- VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
-
- ret = rmap_walk(page, &rwc);
- return ret;
+ if (unlikely(PageKsm(page)))
+ return try_to_unmap_ksm(page, TTU_MUNLOCK);
+ else if (PageAnon(page))
+ return try_to_unmap_anon(page, TTU_MUNLOCK);
+ else
+ return try_to_unmap_file(page, TTU_MUNLOCK);
}
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
- anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
+
+ anon_vma_free(anon_vma);
}
-static struct anon_vma *rmap_walk_anon_lock(struct page *page,
- struct rmap_walk_control *rwc)
+#ifdef CONFIG_MIGRATION
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
-
- if (rwc->anon_lock)
- return rwc->anon_lock(page);
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1616,120 +1709,58 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
- return NULL;
-
- anon_vma_lock_read(anon_vma);
- return anon_vma;
-}
-
-/*
- * rmap_walk_anon - do something to anonymous page using the object-based
- * rmap method
- * @page: the page to be handled
- * @rwc: control variable according to each walk type
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
- *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
- */
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
-{
- struct anon_vma *anon_vma;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
-
- anon_vma = rmap_walk_anon_lock(page, rwc);
- if (!anon_vma)
return ret;
-
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
-
- if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
- continue;
-
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
+ ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
- if (rwc->done && rwc->done(page))
- break;
}
anon_vma_unlock_read(anon_vma);
return ret;
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @page: the page to be handled
- * @rwc: control variable according to each walk type
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
- */
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << compound_order(page);
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
- */
- VM_BUG_ON(!PageLocked(page));
-
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
-
- if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
- continue;
-
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
+ ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
- goto done;
- if (rwc->done && rwc->done(page))
- goto done;
+ break;
}
-
- if (!rwc->file_nonlinear)
- goto done;
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto done;
-
- ret = rwc->file_nonlinear(page, mapping, rwc->arg);
-
-done:
+ /*
+ * No nonlinear handling: being always shared, nonlinear vmas
+ * never contain migration ptes. Decide what to do about this
+ * limitation to linear when we need rmap_walk() on nonlinear.
+ */
mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
+ struct vm_area_struct *, unsigned long, void *), void *arg)
{
+ VM_BUG_ON(!PageLocked(page));
+
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rwc);
+ return rmap_walk_ksm(page, rmap_one, arg);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc);
+ return rmap_walk_anon(page, rmap_one, arg);
else
- return rmap_walk_file(page, rwc);
+ return rmap_walk_file(page, rmap_one, arg);
}
+#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_HUGETLB_PAGE
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 1140f49b6de..5dd56f6efdb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -25,13 +25,11 @@
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
-#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
-#include <linux/aio.h>
static struct vfsmount *shm_mnt;
@@ -45,7 +43,7 @@ static struct vfsmount *shm_mnt;
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
-#include <linux/posix_acl_xattr.h>
+#include <linux/generic_acl.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -80,12 +78,11 @@ static struct vfsmount *shm_mnt;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
- * a time): we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
- int mode; /* FALLOC_FL mode currently operating */
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -243,17 +240,19 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
void **pslot;
- void *item;
+ void *item = NULL;
VM_BUG_ON(!expected);
- VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (!pslot)
- return -ENOENT;
- item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
+ if (pslot)
+ item = radix_tree_deref_slot_protected(pslot,
+ &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- radix_tree_replace_slot(pslot, replacement);
+ if (replacement)
+ radix_tree_replace_slot(pslot, replacement);
+ else
+ radix_tree_delete(&mapping->page_tree, index);
return 0;
}
@@ -284,8 +283,8 @@ static int shmem_add_to_page_cache(struct page *page,
{
int error;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageSwapBacked(page));
page_cache_get(page);
page->mapping = mapping;
@@ -330,20 +329,85 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
}
/*
+ * Like find_get_pages, but collecting swap entries as well as pages.
+ */
+static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_pages,
+ struct page **pages, pgoff_t *indices)
+{
+ unsigned int i;
+ unsigned int ret;
+ unsigned int nr_found;
+
+ rcu_read_lock();
+restart:
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, indices, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot((void **)pages[i]);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * Otherwise, we must be storing a swap entry
+ * here as an exceptional entry: so return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *((void **)pages[i]))) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = indices[i];
+ pages[ret] = page;
+ ret++;
+ }
+ if (unlikely(!ret && nr_found))
+ goto restart;
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
* Remove swap entry from radix tree, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
- void *old;
+ int error;
spin_lock_irq(&mapping->tree_lock);
- old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
+ error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
spin_unlock_irq(&mapping->tree_lock);
- if (old != radswap)
- return -ENOENT;
- free_swap_and_cache(radix_to_swp_entry(radswap));
- return 0;
+ if (!error)
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return error;
+}
+
+/*
+ * Pagevec may contain swap entries, so shuffle up pages before releasing.
+ */
+static void shmem_deswap_pagevec(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
}
/*
@@ -364,12 +428,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
*/
- pvec.nr = find_get_entries(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
if (!pvec.nr)
break;
index = indices[pvec.nr - 1] + 1;
- pagevec_remove_exceptionals(&pvec);
+ shmem_deswap_pagevec(&pvec);
check_move_unevictable_pages(pvec.pages, pvec.nr);
pagevec_release(&pvec);
cond_resched();
@@ -401,9 +465,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pagevec_init(&pvec, 0);
index = start;
while (index < end) {
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
if (!pvec.nr)
break;
mem_cgroup_uncharge_start();
@@ -426,13 +490,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON(PageWriteback(page));
truncate_inode_page(mapping, page);
}
}
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
+ shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -470,10 +534,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
for ( ; ; ) {
cond_resched();
-
- pvec.nr = find_get_entries(mapping, index,
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.pages, indices);
if (!pvec.nr) {
if (index == start || unfalloc)
break;
@@ -481,7 +544,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
}
if ((index == start || unfalloc) && indices[0] >= end) {
- pagevec_remove_exceptionals(&pvec);
+ shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
break;
}
@@ -504,13 +567,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
lock_page(page);
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON(PageWriteback(page));
truncate_inode_page(mapping, page);
}
}
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
+ shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -556,8 +619,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
setattr_copy(inode, attr);
+#ifdef CONFIG_TMPFS_POSIX_ACL
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = generic_acl_chmod(inode);
+#endif
return error;
}
@@ -684,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -760,7 +825,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private;
if (shmem_falloc &&
- !shmem_falloc->mode &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped++;
@@ -1017,7 +1081,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
return -EFBIG;
repeat:
swap.val = 0;
- page = find_lock_entry(mapping, index);
+ page = find_lock_page(mapping, index);
if (radix_tree_exceptional_entry(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
@@ -1029,9 +1093,6 @@ repeat:
goto failed;
}
- if (page && sgp == SGP_WRITE)
- mark_page_accessed(page);
-
/* fallocated page? */
if (page && !PageUptodate(page)) {
if (sgp != SGP_READ)
@@ -1085,7 +1146,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_charge_file(page, current->mm,
+ error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1113,9 +1174,6 @@ repeat:
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- if (sgp == SGP_WRITE)
- mark_page_accessed(page);
-
delete_from_swap_cache(page);
set_page_dirty(page);
swap_free(swap);
@@ -1140,16 +1198,13 @@ repeat:
goto decused;
}
- __SetPageSwapBacked(page);
+ SetPageSwapBacked(page);
__set_page_locked(page);
- if (sgp == SGP_WRITE)
- init_page_accessed(page);
-
- error = mem_cgroup_charge_file(page, current->mm,
+ error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
goto decused;
- error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, NULL);
@@ -1240,48 +1295,10 @@ unlock:
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int error;
int ret = VM_FAULT_LOCKED;
- /*
- * Trinity finds that probing a hole which tmpfs is punching can
- * prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_mutex. So refrain from
- * faulting pages into the hole while it's being punched, and
- * wait on i_mutex to be released if vmf->flags permits.
- */
- if (unlikely(inode->i_private)) {
- struct shmem_falloc *shmem_falloc;
-
- spin_lock(&inode->i_lock);
- shmem_falloc = inode->i_private;
- if (!shmem_falloc ||
- shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
- vmf->pgoff < shmem_falloc->start ||
- vmf->pgoff >= shmem_falloc->next)
- shmem_falloc = NULL;
- spin_unlock(&inode->i_lock);
- /*
- * i_lock has protected us from taking shmem_falloc seriously
- * once return from shmem_fallocate() went back up that stack.
- * i_lock does not serialize with i_mutex at all, but it does
- * not matter if sometimes we wait unnecessarily, or sometimes
- * miss out on waiting: we just need to make those cases rare.
- */
- if (shmem_falloc) {
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- up_read(&vma->vm_mm->mmap_sem);
- mutex_lock(&inode->i_mutex);
- mutex_unlock(&inode->i_mutex);
- return VM_FAULT_RETRY;
- }
- /* cond_resched? Leave that to GUP or return to user */
- return VM_FAULT_NOPAGE;
- }
- }
-
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1296,14 +1313,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}
static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
unsigned long addr)
{
- struct inode *inode = file_inode(vma->vm_file);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
pgoff_t index;
index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1313,7 +1330,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file->f_path.dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
@@ -1401,11 +1418,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-bool shmem_mapping(struct address_space *mapping)
-{
- return mapping->backing_dev_info == &shmem_backing_dev_info;
-}
-
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -1451,17 +1463,13 @@ shmem_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
+ struct inode *inode = filp->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
enum sgp_type sgp = SGP_READ;
- int error = 0;
- ssize_t retval = 0;
- loff_t *ppos = &iocb->ki_pos;
/*
* Might this read be for a stacking filesystem? Then when reading
@@ -1489,10 +1497,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp, NULL);
- if (error) {
- if (error == -EINVAL)
- error = 0;
+ desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
+ if (desc->error) {
+ if (desc->error == -EINVAL)
+ desc->error = 0;
break;
}
if (page)
@@ -1536,26 +1544,61 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
+ *
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
*/
- ret = copy_page_to_iter(page, offset, nr, to);
- retval += ret;
+ ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
- if (!iov_iter_count(to))
- break;
- if (ret < nr) {
- error = -EFAULT;
+ if (ret != nr || !desc->count)
break;
- }
+
cond_resched();
}
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- file_accessed(file);
- return retval ? retval : error;
+ file_accessed(filp);
+}
+
+static ssize_t shmem_file_aio_read(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs, loff_t pos)
+{
+ struct file *filp = iocb->ki_filp;
+ ssize_t retval;
+ unsigned long seg;
+ size_t count;
+ loff_t *ppos = &iocb->ki_pos;
+
+ retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (retval)
+ return retval;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.arg.buf = iov[seg].iov_base;
+ desc.count = iov[seg].iov_len;
+ if (desc.count == 0)
+ continue;
+ desc.error = 0;
+ do_shmem_file_read(filp, ppos, &desc, file_read_actor);
+ retval += desc.written;
+ if (desc.error) {
+ retval = retval ?: desc.error;
+ break;
+ }
+ if (desc.count > 0)
+ break;
+ }
+ return retval;
}
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
@@ -1594,7 +1637,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, spd.nr_pages_max);
+ nr_pages = min(req_pages, pipe->buffers);
spd.nr_pages = find_get_pages_contig(mapping, index,
nr_pages, spd.pages);
@@ -1687,7 +1730,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pagevec_init(&pvec, 0);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
- pvec.nr = find_get_entries(mapping, index,
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
if (whence == SEEK_DATA)
@@ -1714,7 +1757,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
break;
}
}
- pagevec_remove_exceptionals(&pvec);
+ shmem_deswap_pagevec(&pvec);
pagevec_release(&pvec);
pvec.nr = PAGEVEC_SIZE;
cond_resched();
@@ -1754,8 +1797,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
}
}
- if (offset >= 0)
- offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
+ if (offset >= 0 && offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
mutex_unlock(&inode->i_mutex);
return offset;
}
@@ -1763,37 +1808,26 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file->f_path.dentry->d_inode;
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_falloc shmem_falloc;
pgoff_t start, index, end;
int error;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
- return -EOPNOTSUPP;
-
mutex_lock(&inode->i_mutex);
- shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
-
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
- shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
- spin_lock(&inode->i_lock);
- inode->i_private = &shmem_falloc;
- spin_unlock(&inode->i_lock);
-
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
shmem_truncate_range(inode, offset, offset + len - 1);
/* No need to unmap again: hole-punching leaves COWed pages */
error = 0;
- goto undone;
+ goto out;
}
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
@@ -1904,49 +1938,30 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
error = security_inode_init_security(inode, dir,
&dentry->d_name,
shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
-
+ if (error) {
+ if (error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
+ }
+ }
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ error = generic_acl_init(inode, dir);
+ if (error) {
+ iput(inode);
+ return error;
+ }
+#else
error = 0;
+#endif
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
return error;
-out_iput:
- iput(inode);
- return error;
-}
-
-static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- struct inode *inode;
- int error = -ENOSPC;
-
- inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
- if (inode) {
- error = security_inode_init_security(inode, dir,
- NULL,
- shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP)
- goto out_iput;
- error = simple_acl_create(dir, inode);
- if (error)
- goto out_iput;
- d_tmpfile(dentry, inode);
- }
- return error;
-out_iput:
- iput(inode);
- return error;
}
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2178,8 +2193,8 @@ static int shmem_initxattrs(struct inode *inode,
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
+ &generic_acl_access_handler,
+ &generic_acl_default_handler,
#endif
NULL
};
@@ -2336,7 +2351,7 @@ static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
{
if (*len < 3) {
*len = 3;
- return FILEID_INVALID;
+ return 255;
}
if (inode_unhashed(inode)) {
@@ -2371,7 +2386,6 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
bool remount)
{
char *this_char, *value, *rest;
- struct mempolicy *mpol = NULL;
uid_t uid;
gid_t gid;
@@ -2400,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
printk(KERN_ERR
"tmpfs: No value for mount option '%s'\n",
this_char);
- goto error;
+ return 1;
}
if (!strcmp(this_char,"size")) {
@@ -2449,24 +2463,19 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (!gid_valid(sbinfo->gid))
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- mpol_put(mpol);
- mpol = NULL;
- if (mpol_parse_str(value, &mpol))
+ if (mpol_parse_str(value, &sbinfo->mpol))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
- goto error;
+ return 1;
}
}
- sbinfo->mpol = mpol;
return 0;
bad_val:
printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
-error:
- mpol_put(mpol);
return 1;
}
@@ -2478,7 +2487,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
unsigned long inodes;
int error = -EINVAL;
- config.mpol = NULL;
if (shmem_parse_options(data, &config, true))
return error;
@@ -2503,13 +2511,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
- /*
- * Preserve previous mempolicy unless mpol remount option was specified.
- */
- if (config.mpol) {
- mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
- }
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = config.mpol; /* transfers initial ref */
out:
spin_unlock(&sbinfo->stat_lock);
return error;
@@ -2542,7 +2545,6 @@ static void shmem_put_super(struct super_block *sb)
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
percpu_counter_destroy(&sbinfo->used_blocks);
- mpol_put(sbinfo->mpol);
kfree(sbinfo);
sb->s_fs_info = NULL;
}
@@ -2570,15 +2572,13 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
* tmpfs instance, limiting inodes to one per page of lowmem;
* but the internal instance is left unlimited.
*/
- if (!(sb->s_flags & MS_KERNMOUNT)) {
+ if (!(sb->s_flags & MS_NOUSER)) {
sbinfo->max_blocks = shmem_default_max_blocks();
sbinfo->max_inodes = shmem_default_max_inodes();
if (shmem_parse_options(data, sbinfo, false)) {
err = -EINVAL;
goto failed;
}
- } else {
- sb->s_flags |= MS_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
sb->s_flags |= MS_NOSEC;
@@ -2677,13 +2677,13 @@ static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
- .read_iter = shmem_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = shmem_file_aio_read,
+ .aio_write = generic_file_aio_write,
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = generic_file_splice_write,
.fallocate = shmem_fallocate,
#endif
};
@@ -2695,7 +2695,6 @@ static const struct inode_operations shmem_inode_operations = {
.getxattr = shmem_getxattr,
.listxattr = shmem_listxattr,
.removexattr = shmem_removexattr,
- .set_acl = simple_set_acl,
#endif
};
@@ -2710,7 +2709,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
.rmdir = shmem_rmdir,
.mknod = shmem_mknod,
.rename = shmem_rename,
- .tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
@@ -2720,7 +2718,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
- .set_acl = simple_set_acl,
#endif
};
@@ -2733,7 +2730,6 @@ static const struct inode_operations shmem_special_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
- .set_acl = simple_set_acl,
#endif
};
@@ -2752,7 +2748,6 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
- .map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
@@ -2771,17 +2766,12 @@ static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.mount = shmem_mount,
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
};
int __init shmem_init(void)
{
int error;
- /* If rootfs called this, don't re-init */
- if (shmem_inode_cachep)
- return 0;
-
error = bdi_init(&shmem_backing_dev_info);
if (error)
goto out4;
@@ -2796,7 +2786,8 @@ int __init shmem_init(void)
goto out2;
}
- shm_mnt = kern_mount(&shmem_fs_type);
+ shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
+ shmem_fs_type.name, NULL);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2826,11 +2817,12 @@ out4:
* effectively equivalent, but much lighter weight.
*/
+#include <linux/ramfs.h>
+
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
.mount = ramfs_mount,
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
};
int __init shmem_init(void)
@@ -2873,21 +2865,23 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
/* common code */
-static struct dentry_operations anon_ops = {
- .d_dname = simple_dname
-};
-
-static struct file *__shmem_file_setup(const char *name, loff_t size,
- unsigned long flags, unsigned int i_flags)
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
- struct file *res;
+ int error;
+ struct file *file;
struct inode *inode;
struct path path;
- struct super_block *sb;
+ struct dentry *root;
struct qstr this;
if (IS_ERR(shm_mnt))
- return ERR_CAST(shm_mnt);
+ return (void *)shm_mnt;
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
@@ -2895,68 +2889,43 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- res = ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
this.name = name;
this.len = strlen(name);
this.hash = 0; /* will go */
- sb = shm_mnt->mnt_sb;
- path.dentry = d_alloc_pseudo(sb, &this);
+ root = shm_mnt->mnt_root;
+ path.dentry = d_alloc(root, &this);
if (!path.dentry)
goto put_memory;
- d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(shm_mnt);
- res = ERR_PTR(-ENOSPC);
- inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
+ error = -ENOSPC;
+ inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
if (!inode)
goto put_dentry;
- inode->i_flags |= i_flags;
d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
- res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
- if (IS_ERR(res))
+#ifndef CONFIG_MMU
+ error = ramfs_nommu_expand_for_mapping(inode, size);
+ if (error)
goto put_dentry;
+#endif
- res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
+ error = -ENFILE;
+ file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
&shmem_file_operations);
- if (IS_ERR(res))
+ if (!file)
goto put_dentry;
- return res;
+ return file;
put_dentry:
path_put(&path);
put_memory:
shmem_unacct_size(flags, size);
- return res;
-}
-
-/**
- * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
- * kernel internal. There will be NO LSM permission checks against the
- * underlying inode. So users of this interface must do LSM checks at a
- * higher layer. The one user is the big_key implementation. LSM checks
- * are provided at the key level rather than the inode level.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
- */
-struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
-{
- return __shmem_file_setup(name, size, flags, S_PRIVATE);
-}
-
-/**
- * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
- */
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
-{
- return __shmem_file_setup(name, size, flags, 0);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
diff --git a/mm/slab.c b/mm/slab.c
index 3070b929a1b..e7667a3584b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -157,17 +157,6 @@
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
-#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
- <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
-
-#if FREELIST_BYTE_INDEX
-typedef unsigned char freelist_idx_t;
-#else
-typedef unsigned short freelist_idx_t;
-#endif
-
-#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
-
/*
* true if a page was allocated from pfmemalloc reserves for network-based
* swap
@@ -175,6 +164,72 @@ typedef unsigned short freelist_idx_t;
static bool pfmemalloc_active __read_mostly;
/*
+ * kmem_bufctl_t:
+ *
+ * Bufctl's are used for linking objs within a slab
+ * linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, is this 56.
+ * This is not serious, as it is only for large objects, when it is unwise
+ * to have too many per slab.
+ * Note: This limit can be raised by introducing a general cache whose size
+ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+ */
+
+typedef unsigned int kmem_bufctl_t;
+#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
+#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
+#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
+
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU. This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking. We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ */
+struct slab_rcu {
+ struct rcu_head head;
+ struct kmem_cache *cachep;
+ void *addr;
+};
+
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+ union {
+ struct {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
+ };
+ struct slab_rcu __slab_cover_slab_rcu;
+ };
+};
+
+/*
* struct array_cache
*
* Purpose:
@@ -231,27 +286,68 @@ struct arraycache_init {
};
/*
+ * The slab lists for all objects.
+ */
+struct kmem_list3 {
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long free_objects;
+ unsigned int free_limit;
+ unsigned int colour_next; /* Per-node cache coloring */
+ spinlock_t list_lock;
+ struct array_cache *shared; /* shared per node */
+ struct array_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
+};
+
+/*
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
-static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
+static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC MAX_NUMNODES
-#define SIZE_NODE (2 * MAX_NUMNODES)
+#define SIZE_L3 (2 * MAX_NUMNODES)
static int drain_freelist(struct kmem_cache *cache,
- struct kmem_cache_node *n, int tofree);
+ struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);
+/*
+ * This function must be completely optimized away if a constant is passed to
+ * it. Mostly the same as what is in linux/slab.h except it returns an index.
+ */
+static __always_inline int index_of(const size_t size)
+{
+ extern void __bad_size(void);
+
+ if (__builtin_constant_p(size)) {
+ int i = 0;
+
+#define CACHE(x) \
+ if (size <=x) \
+ return i; \
+ else \
+ i++;
+#include <linux/kmalloc_sizes.h>
+#undef CACHE
+ __bad_size();
+ } else
+ __bad_size();
+ return 0;
+}
+
static int slab_early_init = 1;
-#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
-#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
+#define INDEX_AC index_of(sizeof(struct arraycache_init))
+#define INDEX_L3 index_of(sizeof(struct kmem_list3))
-static void kmem_cache_node_init(struct kmem_cache_node *parent)
+static void kmem_list3_init(struct kmem_list3 *parent)
{
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
@@ -267,7 +363,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define MAKE_LIST(cachep, listp, slab, nodeid) \
do { \
INIT_LIST_HEAD(listp); \
- list_splice(&(cachep->node[nodeid]->slab), listp); \
+ list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
} while (0)
#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
@@ -288,8 +384,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
* OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
*/
-#define REAPTIMEOUT_AC (2*HZ)
-#define REAPTIMEOUT_NODE (4*HZ)
+#define REAPTIMEOUT_CPUC (2*HZ)
+#define REAPTIMEOUT_LIST3 (4*HZ)
#if STATS
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
@@ -386,39 +482,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static void set_obj_status(struct page *page, int idx, int val)
-{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
-}
-
-static inline unsigned int get_obj_status(struct page *page, int idx)
-{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
-
- return status[idx];
-}
-
-#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
-
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -434,10 +497,18 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
return page->slab_cache;
}
-static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
+static inline struct slab *virt_to_slab(const void *obj)
+{
+ struct page *page = virt_to_head_page(obj);
+
+ VM_BUG_ON(!PageSlab(page));
+ return page->slab_page;
+}
+
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
unsigned int idx)
{
- return page->s_mem + cache->size * idx;
+ return slab->s_mem + cache->size * idx;
}
/*
@@ -447,12 +518,36 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
* reciprocal_divide(offset, cache->reciprocal_buffer_size)
*/
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
- const struct page *page, void *obj)
+ const struct slab *slab, void *obj)
{
- u32 offset = (obj - page->s_mem);
+ u32 offset = (obj - slab->s_mem);
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ */
+struct cache_sizes malloc_sizes[] = {
+#define CACHE(x) { .cs_size = (x) },
+#include <linux/kmalloc_sizes.h>
+ CACHE(ULONG_MAX)
+#undef CACHE
+};
+EXPORT_SYMBOL(malloc_sizes);
+
+/* Must match cache_sizes above. Out of line to keep cache footprint low. */
+struct cache_names {
+ char *name;
+ char *name_dma;
+};
+
+static struct cache_names __initdata cache_names[] = {
+#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
+#include <linux/kmalloc_sizes.h>
+ {NULL,}
+#undef CACHE
+};
+
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
@@ -491,15 +586,15 @@ static void slab_set_lock_classes(struct kmem_cache *cachep,
int q)
{
struct array_cache **alc;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
int r;
- n = cachep->node[q];
- if (!n)
+ l3 = cachep->nodelists[q];
+ if (!l3)
return;
- lockdep_set_class(&n->list_lock, l3_key);
- alc = n->alien;
+ lockdep_set_class(&l3->list_lock, l3_key);
+ alc = l3->alien;
/*
* FIXME: This check for BAD_ALIEN_MAGIC
* should go away when common slab code is taught to
@@ -530,30 +625,28 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
static void init_node_lock_keys(int q)
{
- int i;
+ struct cache_sizes *s = malloc_sizes;
if (slab_state < UP)
return;
- for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache_node *n;
- struct kmem_cache *cache = kmalloc_caches[i];
-
- if (!cache)
- continue;
+ for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
+ struct kmem_list3 *l3;
- n = cache->node[q];
- if (!n || OFF_SLAB(cache))
+ l3 = s->cs_cachep->nodelists[q];
+ if (!l3 || OFF_SLAB(s->cs_cachep))
continue;
- slab_set_lock_classes(cache, &on_slab_l3_key,
+ slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
&on_slab_alc_key, q);
}
}
static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
{
- if (!cachep->node[q])
+ struct kmem_list3 *l3;
+ l3 = cachep->nodelists[q];
+ if (!l3)
return;
slab_set_lock_classes(cachep, &on_slab_l3_key,
@@ -609,50 +702,44 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return cachep->array[smp_processor_id()];
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+ gfp_t gfpflags)
{
- size_t freelist_size;
+ struct cache_sizes *csizep = malloc_sizes;
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
+#if DEBUG
+ /* This happens if someone tries to call
+ * kmem_cache_create(), or __kmalloc(), before
+ * the generic caches are initialized.
+ */
+ BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
+#endif
+ if (!size)
+ return ZERO_SIZE_PTR;
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
+ while (size > csizep->cs_size)
+ csizep++;
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
/*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
+ * Really subtle: The last entry with cs->cs_size==ULONG_MAX
+ * has cs_{dma,}cachep==NULL. Thus no special case
+ * for large kmalloc calls required.
*/
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
+#ifdef CONFIG_ZONE_DMA
+ if (unlikely(gfpflags & GFP_DMA))
+ return csizep->cs_dmacachep;
+#endif
+ return csizep->cs_cachep;
+}
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
+static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+{
+ return __find_general_cachep(size, gfpflags);
+}
- return nr_objs;
+static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+{
+ return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
/*
@@ -671,7 +758,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
+ * - The struct slab
+ * - One kmem_bufctl_t for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
@@ -684,10 +772,32 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
mgmt_size = 0;
nr_objs = slab_size / buffer_size;
+ if (nr_objs > SLAB_LIMIT)
+ nr_objs = SLAB_LIMIT;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ /*
+ * Ignore padding for the initial guess. The padding
+ * is at most @align-1 bytes, and @buffer_size is at
+ * least @align. In the worst case, this result will
+ * be one greater than the number of objects that fit
+ * into the memory allocation when taking the padding
+ * into account.
+ */
+ nr_objs = (slab_size - sizeof(struct slab)) /
+ (buffer_size + sizeof(kmem_bufctl_t));
+
+ /*
+ * This calculated number will be either the right
+ * amount, or one greater than what we want.
+ */
+ if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
+ > slab_size)
+ nr_objs--;
+
+ if (nr_objs > SLAB_LIMIT)
+ nr_objs = SLAB_LIMIT;
+
+ mgmt_size = slab_mgmt_size(nr_objs, align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -702,7 +812,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ add_taint(TAINT_BAD_PAGE);
}
#endif
@@ -775,7 +885,7 @@ static void next_reap_node(void)
* the CPUs getting into lockstep and contending for the global cache chain
* lock.
*/
-static void start_cpu_timer(int cpu)
+static void __cpuinit start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
@@ -817,8 +927,10 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
+static inline bool is_slab_pfmemalloc(struct slab *slabp)
{
+ struct page *page = virt_to_page(slabp->s_mem);
+
return PageSlabPfmemalloc(page);
}
@@ -826,29 +938,29 @@ static inline bool is_slab_pfmemalloc(struct page *page)
static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
struct array_cache *ac)
{
- struct kmem_cache_node *n = cachep->node[numa_mem_id()];
- struct page *page;
+ struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
+ struct slab *slabp;
unsigned long flags;
if (!pfmemalloc_active)
return;
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
+ spin_lock_irqsave(&l3->list_lock, flags);
+ list_for_each_entry(slabp, &l3->slabs_full, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
+ list_for_each_entry(slabp, &l3->slabs_free, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
pfmemalloc_active = false;
out:
- spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&l3->list_lock, flags);
}
static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
@@ -859,7 +971,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
if (gfp_pfmemalloc_allowed(flags)) {
clear_obj_pfmemalloc(&objp);
@@ -881,10 +993,10 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
* If there are empty slabs on the slabs_free list and we are
* being forced to refill the cache, mark this one !pfmemalloc.
*/
- n = cachep->node[numa_mem_id()];
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
+ l3 = cachep->nodelists[numa_mem_id()];
+ if (!list_empty(&l3->slabs_free) && force_refill) {
+ struct slab *slabp = virt_to_slab(objp);
+ ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
clear_obj_pfmemalloc(&objp);
recheck_pfmemalloc_active(cachep, ac);
return objp;
@@ -959,7 +1071,7 @@ static int transfer_objects(struct array_cache *to,
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, n) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
@@ -1031,33 +1143,33 @@ static void free_alien_cache(struct array_cache **ac_ptr)
static void __drain_alien_cache(struct kmem_cache *cachep,
struct array_cache *ac, int node)
{
- struct kmem_cache_node *n = cachep->node[node];
+ struct kmem_list3 *rl3 = cachep->nodelists[node];
if (ac->avail) {
- spin_lock(&n->list_lock);
+ spin_lock(&rl3->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
* into the free lists and getting them back later.
*/
- if (n->shared)
- transfer_objects(n->shared, ac, ac->limit);
+ if (rl3->shared)
+ transfer_objects(rl3->shared, ac, ac->limit);
free_block(cachep, ac->entry, ac->avail, node);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ spin_unlock(&rl3->list_lock);
}
}
/*
* Called from cache_reap() to regularly drain alien caches round robin.
*/
-static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
int node = __this_cpu_read(slab_reap_node);
- if (n->alien) {
- struct array_cache *ac = n->alien[node];
+ if (l3->alien) {
+ struct array_cache *ac = l3->alien[node];
if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
__drain_alien_cache(cachep, ac, node);
@@ -1085,8 +1197,9 @@ static void drain_alien_cache(struct kmem_cache *cachep,
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- int nodeid = page_to_nid(virt_to_page(objp));
- struct kmem_cache_node *n;
+ struct slab *slabp = virt_to_slab(objp);
+ int nodeid = slabp->nodeid;
+ struct kmem_list3 *l3;
struct array_cache *alien = NULL;
int node;
@@ -1096,13 +1209,13 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(nodeid == node))
+ if (likely(slabp->nodeid == node))
return 0;
- n = cachep->node[node];
+ l3 = cachep->nodelists[node];
STATS_INC_NODEFREES(cachep);
- if (n->alien && n->alien[nodeid]) {
- alien = n->alien[nodeid];
+ if (l3->alien && l3->alien[nodeid]) {
+ alien = l3->alien[nodeid];
spin_lock(&alien->lock);
if (unlikely(alien->avail == alien->limit)) {
STATS_INC_ACOVERFLOW(cachep);
@@ -1111,70 +1224,64 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
ac_put_obj(cachep, alien, objp);
spin_unlock(&alien->lock);
} else {
- spin_lock(&(cachep->node[nodeid])->list_lock);
+ spin_lock(&(cachep->nodelists[nodeid])->list_lock);
free_block(cachep, &objp, 1, nodeid);
- spin_unlock(&(cachep->node[nodeid])->list_lock);
+ spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
}
return 1;
}
#endif
/*
- * Allocates and initializes node for a node on each slab cache, used for
- * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
* will be allocated off-node since memory is not yet online for the new node.
- * When hotplugging memory or a cpu, existing node are not replaced if
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
* already in use.
*
* Must hold slab_mutex.
*/
-static int init_cache_node_node(int node)
+static int init_cache_nodelists_node(int node)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n;
- const int memsize = sizeof(struct kmem_cache_node);
+ struct kmem_list3 *l3;
+ const int memsize = sizeof(struct kmem_list3);
list_for_each_entry(cachep, &slab_caches, list) {
/*
- * Set up the kmem_cache_node for cpu before we can
+ * Set up the size64 kmemlist for cpu before we can
* begin anything. Make sure some other cpu on this
* node has not already allocated this
*/
- if (!cachep->node[node]) {
- n = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!n)
+ if (!cachep->nodelists[node]) {
+ l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!l3)
return -ENOMEM;
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
/*
- * The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
+ * The l3s don't come and go as CPUs come and
+ * go. slab_mutex is sufficient
* protection here.
*/
- cachep->node[node] = n;
+ cachep->nodelists[node] = l3;
}
- spin_lock_irq(&cachep->node[node]->list_lock);
- cachep->node[node]->free_limit =
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
(1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->node[node]->list_lock);
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
}
return 0;
}
-static inline int slabs_tofree(struct kmem_cache *cachep,
- struct kmem_cache_node *n)
-{
- return (n->free_objects + cachep->num - 1) / cachep->num;
-}
-
-static void cpuup_canceled(long cpu)
+static void __cpuinit cpuup_canceled(long cpu)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n = NULL;
+ struct kmem_list3 *l3 = NULL;
int node = cpu_to_mem(cpu);
const struct cpumask *mask = cpumask_of_node(node);
@@ -1186,34 +1293,34 @@ static void cpuup_canceled(long cpu)
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
- n = cachep->node[node];
+ l3 = cachep->nodelists[node];
- if (!n)
+ if (!l3)
goto free_array_cache;
- spin_lock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
- /* Free limit for this kmem_cache_node */
- n->free_limit -= cachep->batchcount;
+ /* Free limit for this kmem_list3 */
+ l3->free_limit -= cachep->batchcount;
if (nc)
free_block(cachep, nc->entry, nc->avail, node);
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
goto free_array_cache;
}
- shared = n->shared;
+ shared = l3->shared;
if (shared) {
free_block(cachep, shared->entry,
shared->avail, node);
- n->shared = NULL;
+ l3->shared = NULL;
}
- alien = n->alien;
- n->alien = NULL;
+ alien = l3->alien;
+ l3->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
kfree(shared);
if (alien) {
@@ -1229,17 +1336,17 @@ free_array_cache:
* shrink each nodelist to its limit.
*/
list_for_each_entry(cachep, &slab_caches, list) {
- n = cachep->node[node];
- if (!n)
+ l3 = cachep->nodelists[node];
+ if (!l3)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, l3, l3->free_objects);
}
}
-static int cpuup_prepare(long cpu)
+static int __cpuinit cpuup_prepare(long cpu)
{
struct kmem_cache *cachep;
- struct kmem_cache_node *n = NULL;
+ struct kmem_list3 *l3 = NULL;
int node = cpu_to_mem(cpu);
int err;
@@ -1247,9 +1354,9 @@ static int cpuup_prepare(long cpu)
* We need to do this right in the beginning since
* alloc_arraycache's are going to use this list.
* kmalloc_node allows us to add the slab to the right
- * kmem_cache_node and not this cpu's kmem_cache_node
+ * kmem_list3 and not this cpu's kmem_list3
*/
- err = init_cache_node_node(node);
+ err = init_cache_nodelists_node(node);
if (err < 0)
goto bad;
@@ -1284,25 +1391,25 @@ static int cpuup_prepare(long cpu)
}
}
cachep->array[cpu] = nc;
- n = cachep->node[node];
- BUG_ON(!n);
+ l3 = cachep->nodelists[node];
+ BUG_ON(!l3);
- spin_lock_irq(&n->list_lock);
- if (!n->shared) {
+ spin_lock_irq(&l3->list_lock);
+ if (!l3->shared) {
/*
* We are serialised from CPU_DEAD or
* CPU_UP_CANCELLED by the cpucontrol lock
*/
- n->shared = shared;
+ l3->shared = shared;
shared = NULL;
}
#ifdef CONFIG_NUMA
- if (!n->alien) {
- n->alien = alien;
+ if (!l3->alien) {
+ l3->alien = alien;
alien = NULL;
}
#endif
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
kfree(shared);
free_alien_cache(alien);
if (cachep->flags & SLAB_DEBUG_OBJECTS)
@@ -1319,7 +1426,7 @@ bad:
return -ENOMEM;
}
-static int cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1357,9 +1464,9 @@ static int cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD_FROZEN:
/*
* Even if all the cpus of a node are down, we don't free the
- * kmem_cache_node of any cache. This to avoid a race between
+ * kmem_list3 of any cache. This to avoid a race between
* cpu_down, and a kmalloc allocation from another cpu for
- * memory from the node of the cpu going down. The node
+ * memory from the node of the cpu going down. The list3
* structure is usually allocated from kmem_cache_create() and
* gets destroyed at kmem_cache_destroy().
*/
@@ -1375,7 +1482,7 @@ static int cpuup_callback(struct notifier_block *nfb,
return notifier_from_errno(err);
}
-static struct notifier_block cpucache_notifier = {
+static struct notifier_block __cpuinitdata cpucache_notifier = {
&cpuup_callback, NULL, 0
};
@@ -1387,22 +1494,22 @@ static struct notifier_block cpucache_notifier = {
*
* Must hold slab_mutex.
*/
-static int __meminit drain_cache_node_node(int node)
+static int __meminit drain_cache_nodelists_node(int node)
{
struct kmem_cache *cachep;
int ret = 0;
list_for_each_entry(cachep, &slab_caches, list) {
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
- n = cachep->node[node];
- if (!n)
+ l3 = cachep->nodelists[node];
+ if (!l3)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, l3, l3->free_objects);
- if (!list_empty(&n->slabs_full) ||
- !list_empty(&n->slabs_partial)) {
+ if (!list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial)) {
ret = -EBUSY;
break;
}
@@ -1424,12 +1531,12 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
switch (action) {
case MEM_GOING_ONLINE:
mutex_lock(&slab_mutex);
- ret = init_cache_node_node(nid);
+ ret = init_cache_nodelists_node(nid);
mutex_unlock(&slab_mutex);
break;
case MEM_GOING_OFFLINE:
mutex_lock(&slab_mutex);
- ret = drain_cache_node_node(nid);
+ ret = drain_cache_nodelists_node(nid);
mutex_unlock(&slab_mutex);
break;
case MEM_ONLINE:
@@ -1444,49 +1551,49 @@ out:
#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
/*
- * swap the static kmem_cache_node with kmalloced memory
+ * swap the static kmem_list3 with kmalloced memory
*/
-static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
int nodeid)
{
- struct kmem_cache_node *ptr;
+ struct kmem_list3 *ptr;
- ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
BUG_ON(!ptr);
- memcpy(ptr, list, sizeof(struct kmem_cache_node));
+ memcpy(ptr, list, sizeof(struct kmem_list3));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
- cachep->node[nodeid] = ptr;
+ cachep->nodelists[nodeid] = ptr;
}
/*
- * For setting up all the kmem_cache_node for cache whose buffer_size is same as
- * size of kmem_cache_node.
+ * For setting up all the kmem_list3s for cache whose buffer_size is same as
+ * size of kmem_list3.
*/
-static void __init set_up_node(struct kmem_cache *cachep, int index)
+static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
int node;
for_each_online_node(node) {
- cachep->node[node] = &init_kmem_cache_node[index + node];
- cachep->node[node]->next_reap = jiffies +
- REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ cachep->nodelists[node] = &initkmem_list3[index + node];
+ cachep->nodelists[node]->next_reap = jiffies +
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
}
}
/*
* The memory after the last cpu cache pointer is used for the
- * the node pointer.
+ * the nodelists pointer.
*/
-static void setup_node_pointer(struct kmem_cache *cachep)
+static void setup_nodelists_pointer(struct kmem_cache *cachep)
{
- cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
}
/*
@@ -1495,20 +1602,20 @@ static void setup_node_pointer(struct kmem_cache *cachep)
*/
void __init kmem_cache_init(void)
{
+ struct cache_sizes *sizes;
+ struct cache_names *names;
int i;
- BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
- sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
- setup_node_pointer(kmem_cache);
+ setup_nodelists_pointer(kmem_cache);
if (num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++)
- kmem_cache_node_init(&init_kmem_cache_node[i]);
+ kmem_list3_init(&initkmem_list3[i]);
- set_up_node(kmem_cache, CACHE_CACHE);
+ set_up_list3s(kmem_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
@@ -1524,7 +1631,7 @@ void __init kmem_cache_init(void)
* kmem_cache structures of all caches, except kmem_cache itself:
* kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
- * kmem_cache_node structures, it's replaced with a kmalloc allocated
+ * kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The struct kmem_cache for the new cache is allocated normally.
@@ -1533,7 +1640,7 @@ void __init kmem_cache_init(void)
* head arrays.
* 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_cache_node for kmem_cache and
+ * 5) Replace the __init data for kmem_list3 for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
@@ -1545,28 +1652,50 @@ void __init kmem_cache_init(void)
*/
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
- nr_node_ids * sizeof(struct kmem_cache_node *),
+ nr_node_ids * sizeof(struct kmem_list3 *),
SLAB_HWCACHE_ALIGN);
list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
+ sizes = malloc_sizes;
+ names = cache_names;
/*
* Initialize the caches that provide memory for the array cache and the
- * kmem_cache_node structures first. Without this, further allocations will
+ * kmem_list3 structures first. Without this, further allocations will
* bug.
*/
- kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
- kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
+ sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
- if (INDEX_AC != INDEX_NODE)
- kmalloc_caches[INDEX_NODE] =
- create_kmalloc_cache("kmalloc-node",
- kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+ if (INDEX_AC != INDEX_L3)
+ sizes[INDEX_L3].cs_cachep =
+ create_kmalloc_cache(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
+ while (sizes->cs_size != ULONG_MAX) {
+ /*
+ * For performance, all the general caches are L1 aligned.
+ * This should be particularly beneficial on SMP boxes, as it
+ * eliminates "false sharing".
+ * Note for systems short on memory removing the alignment will
+ * allow tighter packing of the smaller caches.
+ */
+ if (!sizes->cs_cachep)
+ sizes->cs_cachep = create_kmalloc_cache(names->name,
+ sizes->cs_size, ARCH_KMALLOC_FLAGS);
+
+#ifdef CONFIG_ZONE_DMA
+ sizes->cs_dmacachep = create_kmalloc_cache(
+ names->name_dma, sizes->cs_size,
+ SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
+#endif
+ sizes++;
+ names++;
+ }
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
@@ -1584,35 +1713,36 @@ void __init kmem_cache_init(void)
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
+ BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
!= &initarray_generic.cache);
- memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
+ memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
- kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
+ malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
+ ptr;
}
- /* 5) Replace the bootstrap kmem_cache_node */
+ /* 5) Replace the bootstrap kmem_list3's */
{
int nid;
for_each_online_node(nid) {
- init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
+ init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
- init_list(kmalloc_caches[INDEX_AC],
- &init_kmem_cache_node[SIZE_AC + nid], nid);
+ init_list(malloc_sizes[INDEX_AC].cs_cachep,
+ &initkmem_list3[SIZE_AC + nid], nid);
- if (INDEX_AC != INDEX_NODE) {
- init_list(kmalloc_caches[INDEX_NODE],
- &init_kmem_cache_node[SIZE_NODE + nid], nid);
+ if (INDEX_AC != INDEX_L3) {
+ init_list(malloc_sizes[INDEX_L3].cs_cachep,
+ &initkmem_list3[SIZE_L3 + nid], nid);
}
}
}
- create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
+ slab_state = UP;
}
void __init kmem_cache_init_late(void)
@@ -1643,7 +1773,7 @@ void __init kmem_cache_init_late(void)
#ifdef CONFIG_NUMA
/*
* Register a memory hotplug callback that initializes and frees
- * node.
+ * nodelists.
*/
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif
@@ -1673,16 +1803,10 @@ __initcall(cpucache_init);
static noinline void
slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
-#if DEBUG
- struct kmem_cache_node *n;
- struct page *page;
+ struct kmem_list3 *l3;
+ struct slab *slabp;
unsigned long flags;
int node;
- static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
- return;
printk(KERN_WARNING
"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
@@ -1694,24 +1818,24 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
unsigned long active_slabs = 0, num_slabs = 0;
- n = cachep->node[node];
- if (!n)
+ l3 = cachep->nodelists[node];
+ if (!l3)
continue;
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru) {
+ spin_lock_irqsave(&l3->list_lock, flags);
+ list_for_each_entry(slabp, &l3->slabs_full, list) {
active_objs += cachep->num;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_partial, lru) {
- active_objs += page->active;
+ list_for_each_entry(slabp, &l3->slabs_partial, list) {
+ active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_free, lru)
+ list_for_each_entry(slabp, &l3->slabs_free, list)
num_slabs++;
- free_objects += n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ free_objects += l3->free_objects;
+ spin_unlock_irqrestore(&l3->list_lock, flags);
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
@@ -1720,7 +1844,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
-#endif
}
/*
@@ -1730,23 +1853,28 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
- int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
struct page *page;
int nr_pages;
+ int i;
+
+#ifndef CONFIG_MMU
+ /*
+ * Nommu uses slab's for process anonymous memory allocations, and thus
+ * requires __GFP_COMP to properly refcount higher order allocations
+ */
+ flags |= __GFP_COMP;
+#endif
flags |= cachep->allocflags;
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
- if (memcg_charge_slab(cachep, flags, cachep->gfporder))
- return NULL;
-
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
- memcg_uncharge_slab(cachep, cachep->gfporder);
- slab_out_of_memory(cachep, flags, nodeid);
+ if (!(flags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
@@ -1761,9 +1889,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
- __SetPageSlab(page);
- if (page->pfmemalloc)
- SetPageSlabPfmemalloc(page);
+ for (i = 0; i < nr_pages; i++) {
+ __SetPageSlab(page + i);
+
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page + i);
+ }
+ memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1774,15 +1906,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
kmemcheck_mark_unallocated_pages(page, nr_pages);
}
- return page;
+ return page_address(page);
}
/*
* Interface to system's page release.
*/
-static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
+static void kmem_freepages(struct kmem_cache *cachep, void *addr)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ unsigned long i = (1 << cachep->gfporder);
+ struct page *page = virt_to_page(addr);
+ const unsigned long nr_freed = i;
kmemcheck_free_shadow(page, cachep->gfporder);
@@ -1792,28 +1926,27 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
else
sub_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_freed);
+ while (i--) {
+ BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+ page++;
+ }
- BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
- page_mapcount_reset(page);
- page->mapping = NULL;
-
+ memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_pages(page, cachep->gfporder);
- memcg_uncharge_slab(cachep, cachep->gfporder);
+ free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
{
- struct kmem_cache *cachep;
- struct page *page;
-
- page = container_of(head, struct page, rcu_head);
- cachep = page->slab_cache;
+ struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
+ struct kmem_cache *cachep = slab_rcu->cachep;
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab_rcu->addr);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slab_rcu);
}
#if DEBUG
@@ -1907,9 +2040,11 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
- *dbg_userword(cachep, objp),
- *dbg_userword(cachep, objp));
+ printk(KERN_ERR "Last user: [<%p>]",
+ *dbg_userword(cachep, objp));
+ print_symbol("(%s)",
+ (unsigned long)*dbg_userword(cachep, objp));
+ printk("\n");
}
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
@@ -1962,19 +2097,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct page *page = virt_to_head_page(objp);
+ struct slab *slabp = virt_to_slab(objp);
unsigned int objnr;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slabp, objp);
if (objnr) {
- objp = index_to_obj(cachep, page, objnr - 1);
+ objp = index_to_obj(cachep, slabp, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = index_to_obj(cachep, page, objnr + 1);
+ objp = index_to_obj(cachep, slabp, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Next obj: start=%p, len=%d\n",
realobj, size);
@@ -1985,12 +2120,11 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#endif
#if DEBUG
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slabp, i);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -2015,8 +2149,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
}
#else
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
@@ -2024,40 +2157,29 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
- * @page: page pointer being destroyed
+ * @slabp: slab pointer being destroyed
*
* Destroy all the objs in a slab, and release the mem back to the system.
* Before calling the slab must have been unlinked from the cache. The
* cache-lock is not held/needed.
*/
-static void slab_destroy(struct kmem_cache *cachep, struct page *page)
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
- void *freelist;
+ void *addr = slabp->s_mem - slabp->colouroff;
- freelist = page->freelist;
- slab_destroy_debugcheck(cachep, page);
+ slab_destroy_debugcheck(cachep, slabp);
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
- struct rcu_head *head;
-
- /*
- * RCU free overloads the RCU head over the LRU.
- * slab_page has been overloeaded over the LRU,
- * however it is not used from now on so that
- * we can use it safely.
- */
- head = (void *)&page->rcu_head;
- call_rcu(head, kmem_rcu_free);
+ struct slab_rcu *slab_rcu;
+ slab_rcu = (struct slab_rcu *)slabp;
+ slab_rcu->cachep = cachep;
+ slab_rcu->addr = addr;
+ call_rcu(&slab_rcu->head, kmem_rcu_free);
} else {
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, addr);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slabp);
}
-
- /*
- * From now on, we don't use freelist
- * although actual page can be freed in rcu context
- */
- if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
}
/**
@@ -2088,21 +2210,14 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (!num)
continue;
- /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
- if (num > SLAB_OBJ_MAX_NUM)
- break;
-
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ offslab_limit = size - sizeof(struct slab);
+ offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)
break;
@@ -2145,7 +2260,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
if (slab_state == DOWN) {
/*
* Note: Creation of first cache (kmem_cache).
- * The setup_node is taken care
+ * The setup_list3s is taken care
* of by the caller of __kmem_cache_create
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
@@ -2159,13 +2274,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
cachep->array[smp_processor_id()] = &initarray_generic.cache;
/*
- * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
- * the second cache, then we need to set up all its node/,
+ * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+ * the second cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/
- set_up_node(cachep, SIZE_AC);
- if (INDEX_AC == INDEX_NODE)
- slab_state = PARTIAL_NODE;
+ set_up_list3s(cachep, SIZE_AC);
+ if (INDEX_AC == INDEX_L3)
+ slab_state = PARTIAL_L3;
else
slab_state = PARTIAL_ARRAYCACHE;
} else {
@@ -2174,22 +2289,22 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
kmalloc(sizeof(struct arraycache_init), gfp);
if (slab_state == PARTIAL_ARRAYCACHE) {
- set_up_node(cachep, SIZE_NODE);
- slab_state = PARTIAL_NODE;
+ set_up_list3s(cachep, SIZE_L3);
+ slab_state = PARTIAL_L3;
} else {
int node;
for_each_online_node(node) {
- cachep->node[node] =
- kmalloc_node(sizeof(struct kmem_cache_node),
+ cachep->nodelists[node] =
+ kmalloc_node(sizeof(struct kmem_list3),
gfp, node);
- BUG_ON(!cachep->node[node]);
- kmem_cache_node_init(cachep->node[node]);
+ BUG_ON(!cachep->nodelists[node]);
+ kmem_list3_init(cachep->nodelists[node]);
}
}
}
- cachep->node[numa_mem_id()]->next_reap =
- jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ cachep->nodelists[numa_mem_id()]->next_reap =
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2224,7 +2339,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size, ralign;
+ size_t left_over, slab_size, ralign;
gfp_t gfp;
int err;
size_t size = cachep->size;
@@ -2290,7 +2405,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
gfp = GFP_NOWAIT;
- setup_node_pointer(cachep);
+ setup_nodelists_pointer(cachep);
#if DEBUG
/*
@@ -2313,7 +2428,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= kmalloc_size(INDEX_NODE + 1)
+ if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
&& cachep->object_size > cache_line_size()
&& ALIGN(size, cachep->align) < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
@@ -2328,7 +2443,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
+ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
@@ -2337,32 +2452,28 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
- /*
- * We should restrict the number of objects in a slab to implement
- * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
- */
- if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
- size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
if (!cachep->num)
return -E2BIG;
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
+ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ + sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
+ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ left_over -= slab_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ slab_size =
+ cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2379,24 +2490,24 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ cachep->slab_size = slab_size;
cachep->flags = flags;
- cachep->allocflags = __GFP_COMP;
+ cachep->allocflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
+ cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
+ * This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
+ * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
+ * this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
err = setup_cpu_cache(cachep, gfp);
@@ -2434,7 +2545,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);
+ assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
#endif
}
@@ -2442,7 +2553,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->node[node]->list_lock);
+ assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}
@@ -2453,7 +2564,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac,
int force, int node);
@@ -2465,29 +2576,29 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
- spin_lock(&cachep->node[node]->list_lock);
+ spin_lock(&cachep->nodelists[node]->list_lock);
free_block(cachep, ac->entry, ac->avail, node);
- spin_unlock(&cachep->node[node]->list_lock);
+ spin_unlock(&cachep->nodelists[node]->list_lock);
ac->avail = 0;
}
static void drain_cpu_caches(struct kmem_cache *cachep)
{
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
int node;
on_each_cpu(do_drain, cachep, 1);
check_irq_on();
for_each_online_node(node) {
- n = cachep->node[node];
- if (n && n->alien)
- drain_alien_cache(cachep, n->alien);
+ l3 = cachep->nodelists[node];
+ if (l3 && l3->alien)
+ drain_alien_cache(cachep, l3->alien);
}
for_each_online_node(node) {
- n = cachep->node[node];
- if (n)
- drain_array(cachep, n, n->shared, 1, node);
+ l3 = cachep->nodelists[node];
+ if (l3)
+ drain_array(cachep, l3, l3->shared, 1, node);
}
}
@@ -2498,66 +2609,88 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
* Returns the actual number of slabs released.
*/
static int drain_freelist(struct kmem_cache *cache,
- struct kmem_cache_node *n, int tofree)
+ struct kmem_list3 *l3, int tofree)
{
struct list_head *p;
int nr_freed;
- struct page *page;
+ struct slab *slabp;
nr_freed = 0;
- while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
+ while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
- spin_lock_irq(&n->list_lock);
- p = n->slabs_free.prev;
- if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
+ p = l3->slabs_free.prev;
+ if (p == &l3->slabs_free) {
+ spin_unlock_irq(&l3->list_lock);
goto out;
}
- page = list_entry(p, struct page, lru);
+ slabp = list_entry(p, struct slab, list);
#if DEBUG
- BUG_ON(page->active);
+ BUG_ON(slabp->inuse);
#endif
- list_del(&page->lru);
+ list_del(&slabp->list);
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
*/
- n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
- slab_destroy(cache, page);
+ l3->free_objects -= cache->num;
+ spin_unlock_irq(&l3->list_lock);
+ slab_destroy(cache, slabp);
nr_freed++;
}
out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+/* Called with slab_mutex held to protect against cpu hotplug */
+static int __cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
drain_cpu_caches(cachep);
check_irq_on();
for_each_online_node(i) {
- n = cachep->node[i];
- if (!n)
+ l3 = cachep->nodelists[i];
+ if (!l3)
continue;
- drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ drain_freelist(cachep, l3, l3->free_objects);
- ret += !list_empty(&n->slabs_full) ||
- !list_empty(&n->slabs_partial);
+ ret += !list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial);
}
return (ret ? 1 : 0);
}
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+ BUG_ON(!cachep || in_interrupt());
+
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+ ret = __cache_shrink(cachep);
+ mutex_unlock(&slab_mutex);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
- struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep);
+ struct kmem_list3 *l3;
+ int rc = __cache_shrink(cachep);
if (rc)
return rc;
@@ -2565,13 +2698,13 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
for_each_online_cpu(i)
kfree(cachep->array[i]);
- /* NUMA: free the node structures */
+ /* NUMA: free the list3 structures */
for_each_online_node(i) {
- n = cachep->node[i];
- if (n) {
- kfree(n->shared);
- free_alien_cache(n->alien);
- kfree(n);
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
}
}
return 0;
@@ -2579,58 +2712,59 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
/*
* Get the memory for a slab management obj.
- *
- * For a slab cache when the slab descriptor is off-slab, the
- * slab descriptor can't come from the same cache which is being created,
- * Because if it is the case, that means we defer the creation of
- * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
- * And we eventually call down to __kmem_cache_create(), which
- * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
- * This is a "chicken-and-egg" problem.
- *
- * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
- * which are all initialized during kmem_cache_init().
+ * For a slab cache when the slab descriptor is off-slab, slab descriptors
+ * always come from malloc_sizes caches. The slab descriptor cannot
+ * come from the same cache which is getting created because,
+ * when we are searching for an appropriate cache for these
+ * descriptors in kmem_cache_create, we search through the malloc_sizes array.
+ * If we are creating a malloc_sizes cache here it would not be visible to
+ * kmem_find_general_cachep till the initialization is complete.
+ * Hence we cannot have slabp_cache same as the original cache.
*/
-static void *alloc_slabmgmt(struct kmem_cache *cachep,
- struct page *page, int colour_off,
- gfp_t local_flags, int nodeid)
+static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
+ int colour_off, gfp_t local_flags,
+ int nodeid)
{
- void *freelist;
- void *addr = page_address(page);
+ struct slab *slabp;
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ slabp = kmem_cache_alloc_node(cachep->slabp_cache,
local_flags, nodeid);
- if (!freelist)
+ /*
+ * If the first object in the slab is leaked (it's allocated
+ * but no one has a reference to it), we want to make sure
+ * kmemleak does not treat the ->s_mem pointer as a reference
+ * to the object. Otherwise we will not report the leak.
+ */
+ kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+ local_flags);
+ if (!slabp)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ slabp = objp + colour_off;
+ colour_off += cachep->slab_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
- return freelist;
-}
-
-static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
-{
- return ((freelist_idx_t *)page->freelist)[idx];
+ slabp->inuse = 0;
+ slabp->colouroff = colour_off;
+ slabp->s_mem = objp + colour_off;
+ slabp->nodeid = nodeid;
+ slabp->free = 0;
+ return slabp;
}
-static inline void set_free_obj(struct page *page,
- unsigned int idx, freelist_idx_t val)
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
- ((freelist_idx_t *)(page->freelist))[idx] = val;
+ return (kmem_bufctl_t *) (slabp + 1);
}
static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2666,9 +2800,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
if (cachep->ctor)
cachep->ctor(objp);
#endif
- set_obj_status(page, i, OBJECT_FREE);
- set_free_obj(page, i, i);
+ slab_bufctl(slabp)[i] = i + 1;
}
+ slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2681,41 +2815,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
int nodeid)
{
- void *objp;
+ void *objp = index_to_obj(cachep, slabp, slabp->free);
+ kmem_bufctl_t next;
- objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
- page->active++;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ WARN_ON(slabp->nodeid != nodeid);
#endif
+ slabp->free = next;
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
void *objp, int nodeid)
{
- unsigned int objnr = obj_to_index(cachep, page, objp);
-#if DEBUG
- unsigned int i;
+ unsigned int objnr = obj_to_index(cachep, slabp, objp);
+#if DEBUG
/* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ WARN_ON(slabp->nodeid != nodeid);
- /* Verify double free bug */
- for (i = page->active; i < cachep->num; i++) {
- if (get_free_obj(page, i) == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
- BUG();
- }
+ if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
+ printk(KERN_ERR "slab: double free detected in cache "
+ "'%s', objp %p\n", cachep->name, objp);
+ BUG();
}
#endif
- page->active--;
- set_free_obj(page, page->active, objnr);
+ slab_bufctl(slabp)[objnr] = slabp->free;
+ slabp->free = objnr;
+ slabp->inuse--;
}
/*
@@ -2723,11 +2857,23 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
* for the slab allocator to be able to lookup the cache and slab of a
* virtual address for kfree, ksize, and slab debugging.
*/
-static void slab_map_pages(struct kmem_cache *cache, struct page *page,
- void *freelist)
+static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
+ void *addr)
{
- page->slab_cache = cache;
- page->freelist = freelist;
+ int nr_pages;
+ struct page *page;
+
+ page = virt_to_page(addr);
+
+ nr_pages = 1;
+ if (likely(!PageCompound(page)))
+ nr_pages <<= cache->gfporder;
+
+ do {
+ page->slab_cache = cache;
+ page->slab_page = slab;
+ page++;
+ } while (--nr_pages);
}
/*
@@ -2735,12 +2881,12 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, struct page *page)
+ gfp_t flags, int nodeid, void *objp)
{
- void *freelist;
+ struct slab *slabp;
size_t offset;
gfp_t local_flags;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2749,17 +2895,17 @@ static int cache_grow(struct kmem_cache *cachep,
BUG_ON(flags & GFP_SLAB_BUG_MASK);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
- /* Take the node list lock to change the colour_next on this node */
+ /* Take the l3 list lock to change the colour_next on this node */
check_irq_off();
- n = cachep->node[nodeid];
- spin_lock(&n->list_lock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);
/* Get colour for the slab, and cal the next value. */
- offset = n->colour_next;
- n->colour_next++;
- if (n->colour_next >= cachep->colour)
- n->colour_next = 0;
- spin_unlock(&n->list_lock);
+ offset = l3->colour_next;
+ l3->colour_next++;
+ if (l3->colour_next >= cachep->colour)
+ l3->colour_next = 0;
+ spin_unlock(&l3->list_lock);
offset *= cachep->colour_off;
@@ -2778,34 +2924,34 @@ static int cache_grow(struct kmem_cache *cachep,
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- if (!page)
- page = kmem_getpages(cachep, local_flags, nodeid);
- if (!page)
+ if (!objp)
+ objp = kmem_getpages(cachep, local_flags, nodeid);
+ if (!objp)
goto failed;
/* Get slab management. */
- freelist = alloc_slabmgmt(cachep, page, offset,
+ slabp = alloc_slabmgmt(cachep, objp, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (!slabp)
goto opps1;
- slab_map_pages(cachep, page, freelist);
+ slab_map_pages(cachep, slabp, objp);
- cache_init_objs(cachep, page);
+ cache_init_objs(cachep, slabp);
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- spin_lock(&n->list_lock);
+ spin_lock(&l3->list_lock);
/* Make slab active. */
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&slabp->list, &(l3->slabs_free));
STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num;
- spin_unlock(&n->list_lock);
+ l3->free_objects += cachep->num;
+ spin_unlock(&l3->list_lock);
return 1;
opps1:
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, objp);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
@@ -2853,8 +2999,9 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
- unsigned int objnr;
struct page *page;
+ unsigned int objnr;
+ struct slab *slabp;
BUG_ON(virt_to_cache(objp) != cachep);
@@ -2862,6 +3009,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
kfree_debugcheck(objp);
page = virt_to_head_page(objp);
+ slabp = page->slab_page;
+
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
@@ -2870,12 +3019,14 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slabp, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != index_to_obj(cachep, page, objnr));
+ BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2892,16 +3043,40 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
return objp;
}
+static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
+{
+ kmem_bufctl_t i;
+ int entries = 0;
+
+ /* Check slab's freelist to see if this obj is there. */
+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
+ entries++;
+ if (entries > cachep->num || i >= cachep->num)
+ goto bad;
+ }
+ if (entries != cachep->num - slabp->inuse) {
+bad:
+ printk(KERN_ERR "slab: Internal list corruption detected in "
+ "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse,
+ print_tainted());
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
+ sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
+ 1);
+ BUG();
+ }
+}
#else
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x,objp,z) (objp)
+#define check_slabp(x,y) do { } while(0)
#endif
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
bool force_refill)
{
int batchcount;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
struct array_cache *ac;
int node;
@@ -2920,30 +3095,31 @@ retry:
*/
batchcount = BATCHREFILL_LIMIT;
}
- n = cachep->node[node];
+ l3 = cachep->nodelists[node];
- BUG_ON(ac->avail > 0 || !n);
- spin_lock(&n->list_lock);
+ BUG_ON(ac->avail > 0 || !l3);
+ spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
- if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
- n->shared->touched = 1;
+ if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
+ l3->shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
struct list_head *entry;
- struct page *page;
+ struct slab *slabp;
/* Get slab alloc is to come from. */
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
goto must_grow;
}
- page = list_entry(entry, struct page, lru);
+ slabp = list_entry(entry, struct slab, list);
+ check_slabp(cachep, slabp);
check_spinlock_acquired(cachep);
/*
@@ -2951,29 +3127,30 @@ retry:
* there must be at least one object available for
* allocation.
*/
- BUG_ON(page->active >= cachep->num);
+ BUG_ON(slabp->inuse >= cachep->num);
- while (page->active < cachep->num && batchcount--) {
+ while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
node));
}
+ check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
+ list_del(&slabp->list);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &l3->slabs_full);
else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&slabp->list, &l3->slabs_partial);
}
must_grow:
- n->free_objects -= ac->avail;
+ l3->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ spin_unlock(&l3->list_lock);
if (unlikely(!ac->avail)) {
int x;
@@ -3009,8 +3186,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
@@ -3041,9 +3216,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ {
+ struct slab *slabp;
+ unsigned objnr;
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
+ slabp = virt_to_head_page(objp)->slab_page;
+ objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
+ slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+ }
+#endif
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
@@ -3111,7 +3293,7 @@ out:
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
+ * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3126,7 +3308,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = mempolicy_slab_node();
+ nid_alloc = slab_node();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3135,7 +3317,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
/*
* Fallback function if there was no memory available and no objects on a
* certain node and fall back is permitted. First we scan all the
- * available node for available objects. If that fails then we
+ * available nodelists for available objects. If that fails then we
* perform an allocation without specifying a node. This allows the page
* allocator to do its reclaim / fallback magic. We then insert the
* slab into the proper nodelist and then allocate from it.
@@ -3157,8 +3339,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(), flags);
retry:
/*
@@ -3169,8 +3351,8 @@ retry:
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_hardwall(zone, flags) &&
- cache->node[nid] &&
- cache->node[nid]->free_objects) {
+ cache->nodelists[nid] &&
+ cache->nodelists[nid]->free_objects) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
if (obj)
@@ -3185,20 +3367,18 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- struct page *page;
-
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- page = kmem_getpages(cache, local_flags, numa_mem_id());
+ obj = kmem_getpages(cache, local_flags, numa_mem_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
- if (page) {
+ if (obj) {
/*
* Insert into the appropriate per node queues
*/
- nid = page_to_nid(page);
- if (cache_grow(cache, flags, nid, page)) {
+ nid = page_to_nid(virt_to_page(obj));
+ if (cache_grow(cache, flags, nid, obj)) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
if (!obj)
@@ -3215,7 +3395,7 @@ retry:
}
}
- if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
goto retry_cpuset;
return obj;
}
@@ -3227,50 +3407,51 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct list_head *entry;
- struct page *page;
- struct kmem_cache_node *n;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
void *obj;
int x;
- VM_BUG_ON(nodeid > num_online_nodes());
- n = cachep->node[nodeid];
- BUG_ON(!n);
+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
retry:
check_irq_off();
- spin_lock(&n->list_lock);
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
goto must_grow;
}
- page = list_entry(entry, struct page, lru);
+ slabp = list_entry(entry, struct slab, list);
check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
STATS_INC_NODEALLOCS(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- BUG_ON(page->active == cachep->num);
+ BUG_ON(slabp->inuse == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
- n->free_objects--;
+ obj = slab_get_obj(cachep, slabp, nodeid);
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&slabp->list);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &l3->slabs_full);
else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&slabp->list, &l3->slabs_partial);
- spin_unlock(&n->list_lock);
+ spin_unlock(&l3->list_lock);
goto done;
must_grow:
- spin_unlock(&n->list_lock);
+ spin_unlock(&l3->list_lock);
x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
if (x)
goto retry;
@@ -3281,6 +3462,18 @@ done:
return obj;
}
+/**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ * @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
+ */
static __always_inline void *
slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long caller)
@@ -3304,7 +3497,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (nodeid == NUMA_NO_NODE)
nodeid = slab_node;
- if (unlikely(!cachep->node[nodeid])) {
+ if (unlikely(!cachep->nodelists[nodeid])) {
/* Node not bootstrapped yet */
ptr = fallback_alloc(cachep, flags);
goto out;
@@ -3329,11 +3522,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
flags);
- if (likely(ptr)) {
+ if (likely(ptr))
kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+
+ if (unlikely((flags & __GFP_ZERO) && ptr))
+ memset(ptr, 0, cachep->object_size);
return ptr;
}
@@ -3343,7 +3536,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
- if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
+ if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
@@ -3394,59 +3587,61 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
flags);
prefetchw(objp);
- if (likely(objp)) {
+ if (likely(objp))
kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+
+ if (unlikely((flags & __GFP_ZERO) && objp))
+ memset(objp, 0, cachep->object_size);
return objp;
}
/*
- * Caller needs to acquire correct kmem_cache_node's list_lock
+ * Caller needs to acquire correct kmem_list's list_lock
*/
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
int node)
{
int i;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
for (i = 0; i < nr_objects; i++) {
void *objp;
- struct page *page;
+ struct slab *slabp;
clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
- page = virt_to_head_page(objp);
- n = cachep->node[node];
- list_del(&page->lru);
+ slabp = virt_to_slab(objp);
+ l3 = cachep->nodelists[node];
+ list_del(&slabp->list);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ check_slabp(cachep, slabp);
+ slab_put_obj(cachep, slabp, objp, node);
STATS_DEC_ACTIVE(cachep);
- n->free_objects++;
+ l3->free_objects++;
+ check_slabp(cachep, slabp);
/* fixup slab chains */
- if (page->active == 0) {
- if (n->free_objects > n->free_limit) {
- n->free_objects -= cachep->num;
+ if (slabp->inuse == 0) {
+ if (l3->free_objects > l3->free_limit) {
+ l3->free_objects -= cachep->num;
/* No need to drop any previously held
* lock here, even if we have a off-slab slab
* descriptor it is guaranteed to come from
* a different cache, refer to comments before
* alloc_slabmgmt.
*/
- slab_destroy(cachep, page);
+ slab_destroy(cachep, slabp);
} else {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&slabp->list, &l3->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&slabp->list, &l3->slabs_partial);
}
}
}
@@ -3454,7 +3649,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
int node = numa_mem_id();
batchcount = ac->batchcount;
@@ -3462,10 +3657,10 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- n = cachep->node[node];
- spin_lock(&n->list_lock);
- if (n->shared) {
- struct array_cache *shared_array = n->shared;
+ l3 = cachep->nodelists[node];
+ spin_lock(&l3->list_lock);
+ if (l3->shared) {
+ struct array_cache *shared_array = l3->shared;
int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
@@ -3484,12 +3679,12 @@ free_done:
int i = 0;
struct list_head *p;
- p = n->slabs_free.next;
- while (p != &(n->slabs_free)) {
- struct page *page;
+ p = l3->slabs_free.next;
+ while (p != &(l3->slabs_free)) {
+ struct slab *slabp;
- page = list_entry(p, struct page, lru);
- BUG_ON(page->active);
+ slabp = list_entry(p, struct slab, list);
+ BUG_ON(slabp->inuse);
i++;
p = p->next;
@@ -3497,7 +3692,7 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
@@ -3572,17 +3767,6 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#endif
#ifdef CONFIG_NUMA
-/**
- * kmem_cache_alloc_node - Allocate an object on the specified node
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- * @nodeid: node number of the target node.
- *
- * Identical to kmem_cache_alloc but it will allocate memory on the given
- * node, which can improve the performance for cpu bound structures.
- *
- * Fallback to other node is possible if __GFP_THISNODE is not set.
- */
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
@@ -3618,7 +3802,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
- cachep = kmalloc_slab(size, flags);
+ cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
return kmem_cache_alloc_node_trace(cachep, flags, node, size);
@@ -3658,7 +3842,12 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
struct kmem_cache *cachep;
void *ret;
- cachep = kmalloc_slab(size, flags);
+ /* If you want to save a few bytes .text space: replace
+ * __ with kmem_.
+ * Then kmalloc uses the uninlined functions instead of the inline
+ * functions.
+ */
+ cachep = __find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
ret = slab_alloc(cachep, flags, caller);
@@ -3747,12 +3936,12 @@ void kfree(const void *objp)
EXPORT_SYMBOL(kfree);
/*
- * This initializes kmem_cache_node or resizes various caches for all nodes.
+ * This initializes kmem_list3 or resizes various caches for all nodes.
*/
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
struct array_cache *new_shared;
struct array_cache **new_alien = NULL;
@@ -3775,43 +3964,43 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
}
}
- n = cachep->node[node];
- if (n) {
- struct array_cache *shared = n->shared;
+ l3 = cachep->nodelists[node];
+ if (l3) {
+ struct array_cache *shared = l3->shared;
- spin_lock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
if (shared)
free_block(cachep, shared->entry,
shared->avail, node);
- n->shared = new_shared;
- if (!n->alien) {
- n->alien = new_alien;
+ l3->shared = new_shared;
+ if (!l3->alien) {
+ l3->alien = new_alien;
new_alien = NULL;
}
- n->free_limit = (1 + nr_cpus_node(node)) *
+ l3->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
kfree(shared);
free_alien_cache(new_alien);
continue;
}
- n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
- if (!n) {
+ l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
+ if (!l3) {
free_alien_cache(new_alien);
kfree(new_shared);
goto fail;
}
- kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_NODE +
- ((unsigned long)cachep) % REAPTIMEOUT_NODE;
- n->shared = new_shared;
- n->alien = new_alien;
- n->free_limit = (1 + nr_cpus_node(node)) *
+ kmem_list3_init(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ l3->shared = new_shared;
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
- cachep->node[node] = n;
+ cachep->nodelists[node] = l3;
}
return 0;
@@ -3820,13 +4009,13 @@ fail:
/* Cache is not active yet. Roll back what we did */
node--;
while (node >= 0) {
- if (cachep->node[node]) {
- n = cachep->node[node];
+ if (cachep->nodelists[node]) {
+ l3 = cachep->nodelists[node];
- kfree(n->shared);
- free_alien_cache(n->alien);
- kfree(n);
- cachep->node[node] = NULL;
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
+ cachep->nodelists[node] = NULL;
}
node--;
}
@@ -3886,13 +4075,13 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
- spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
+ spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
kfree(ccold);
}
kfree(new);
- return alloc_kmem_cache_node(cachep, gfp);
+ return alloc_kmemlist(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3912,7 +4101,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(cachep, i);
+ c = cache_from_memcg(cachep, i);
if (c)
/* return value determined by the parent cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
@@ -3989,11 +4178,11 @@ skip_setup:
}
/*
- * Drain an array if it contains any elements taking the node lock only if
- * necessary. Note that the node listlock also protects the array_cache
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary. Note that the l3 listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac, int force, int node)
{
int tofree;
@@ -4003,7 +4192,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
if (ac->touched && !force) {
ac->touched = 0;
} else {
- spin_lock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
if (ac->avail) {
tofree = force ? ac->avail : (ac->limit + 4) / 5;
if (tofree > ac->avail)
@@ -4013,7 +4202,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void *) * ac->avail);
}
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
}
}
@@ -4032,7 +4221,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
static void cache_reap(struct work_struct *w)
{
struct kmem_cache *searchp;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
@@ -4044,33 +4233,33 @@ static void cache_reap(struct work_struct *w)
check_irq_on();
/*
- * We only take the node lock if absolutely necessary and we
+ * We only take the l3 lock if absolutely necessary and we
* have established with reasonable certainty that
* we can do some work if the lock was obtained.
*/
- n = searchp->node[node];
+ l3 = searchp->nodelists[node];
- reap_alien(searchp, n);
+ reap_alien(searchp, l3);
- drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
/*
* These are racy checks but it does not matter
* if we skip one check or scan twice.
*/
- if (time_after(n->next_reap, jiffies))
+ if (time_after(l3->next_reap, jiffies))
goto next;
- n->next_reap = jiffies + REAPTIMEOUT_NODE;
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
- drain_array(searchp, n, n->shared, 0, node);
+ drain_array(searchp, l3, l3->shared, 0, node);
- if (n->free_touched)
- n->free_touched = 0;
+ if (l3->free_touched)
+ l3->free_touched = 0;
else {
int freed;
- freed = drain_freelist(searchp, n, (n->free_limit +
+ freed = drain_freelist(searchp, l3, (l3->free_limit +
5 * searchp->num - 1) / (5 * searchp->num));
STATS_ADD_REAPED(searchp, freed);
}
@@ -4082,13 +4271,13 @@ next:
next_reap_node();
out:
/* Set up the next iteration */
- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
+ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
}
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- struct page *page;
+ struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
@@ -4096,42 +4285,42 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
const char *name;
char *error = NULL;
int node;
- struct kmem_cache_node *n;
+ struct kmem_list3 *l3;
active_objs = 0;
num_slabs = 0;
for_each_online_node(node) {
- n = cachep->node[node];
- if (!n)
+ l3 = cachep->nodelists[node];
+ if (!l3)
continue;
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru) {
- if (page->active != cachep->num && !error)
+ list_for_each_entry(slabp, &l3->slabs_full, list) {
+ if (slabp->inuse != cachep->num && !error)
error = "slabs_full accounting error";
active_objs += cachep->num;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_partial, lru) {
- if (page->active == cachep->num && !error)
- error = "slabs_partial accounting error";
- if (!page->active && !error)
- error = "slabs_partial accounting error";
- active_objs += page->active;
+ list_for_each_entry(slabp, &l3->slabs_partial, list) {
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_free, lru) {
- if (page->active && !error)
- error = "slabs_free accounting error";
+ list_for_each_entry(slabp, &l3->slabs_free, list) {
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
num_slabs++;
}
- free_objects += n->free_objects;
- if (n->shared)
- shared_avail += n->shared->avail;
+ free_objects += l3->free_objects;
+ if (l3->shared)
+ shared_avail += l3->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ spin_unlock_irq(&l3->list_lock);
}
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
@@ -4157,7 +4346,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
{
#if STATS
- { /* node stats */
+ { /* list3 stats */
unsigned long high = cachep->high_mark;
unsigned long allocs = cachep->num_allocations;
unsigned long grown = cachep->grown;
@@ -4276,18 +4465,15 @@ static inline int add_caller(unsigned long *n, unsigned long v)
return 1;
}
-static void handle_slab(unsigned long *n, struct kmem_cache *c,
- struct page *page)
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
{
void *p;
int i;
-
if (n[0] == n[1])
return;
- for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
+ if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
continue;
-
if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
return;
}
@@ -4312,10 +4498,10 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
- struct page *page;
- struct kmem_cache_node *n;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
const char *name;
- unsigned long *x = m->private;
+ unsigned long *n = m->private;
int node;
int i;
@@ -4326,53 +4512,63 @@ static int leaks_show(struct seq_file *m, void *p)
/* OK, we can do it */
- x[1] = 0;
+ n[1] = 0;
for_each_online_node(node) {
- n = cachep->node[node];
- if (!n)
+ l3 = cachep->nodelists[node];
+ if (!l3)
continue;
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ spin_lock_irq(&l3->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
+ list_for_each_entry(slabp, &l3->slabs_full, list)
+ handle_slab(n, cachep, slabp);
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
+ handle_slab(n, cachep, slabp);
+ spin_unlock_irq(&l3->list_lock);
}
name = cachep->name;
- if (x[0] == x[1]) {
+ if (n[0] == n[1]) {
/* Increase the buffer size */
mutex_unlock(&slab_mutex);
- m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+ m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
if (!m->private) {
/* Too bad, we are really out */
- m->private = x;
+ m->private = n;
mutex_lock(&slab_mutex);
return -ENOMEM;
}
- *(unsigned long *)m->private = x[0] * 2;
- kfree(x);
+ *(unsigned long *)m->private = n[0] * 2;
+ kfree(n);
mutex_lock(&slab_mutex);
/* Now make sure this entry will be retried */
m->count = m->size;
return 0;
}
- for (i = 0; i < x[1]; i++) {
- seq_printf(m, "%s: %lu ", name, x[2*i+3]);
- show_symbol(m, x[2*i+2]);
+ for (i = 0; i < n[1]; i++) {
+ seq_printf(m, "%s: %lu ", name, n[2*i+3]);
+ show_symbol(m, n[2*i+2]);
seq_putc(m, '\n');
}
return 0;
}
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
static const struct seq_operations slabstats_op = {
.start = leaks_start,
- .next = slab_next,
- .stop = slab_stop,
+ .next = s_next,
+ .stop = s_stop,
.show = leaks_show,
};
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a..34a98d64219 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -16,7 +16,7 @@ enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
- PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
+ PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
};
@@ -35,15 +35,6 @@ extern struct kmem_cache *kmem_cache;
unsigned long calculate_alignment(unsigned long flags,
unsigned long align, unsigned long size);
-#ifndef CONFIG_SLOB
-/* Kmalloc array related functions */
-void create_kmalloc_caches(unsigned long);
-
-/* Find the kmalloc slab corresponding for a certain size */
-struct kmem_cache *kmalloc_slab(size_t, gfp_t);
-#endif
-
-
/* Functions provided by the slab allocators */
extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
@@ -55,12 +46,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
struct mem_cgroup;
#ifdef CONFIG_SLUB
struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *));
#else
static inline struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
@@ -91,8 +82,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
-void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
struct file;
@@ -121,6 +110,28 @@ static inline bool is_root_cache(struct kmem_cache *s)
return !s->memcg_params || s->memcg_params->is_root_cache;
}
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
+{
+ return (is_root_cache(cachep) && !memcg) ||
+ (cachep->memcg_params->memcg == memcg);
+}
+
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+{
+ if (!is_root_cache(s))
+ atomic_add(1 << order, &s->memcg_params->nr_pages);
+}
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
+{
+ if (is_root_cache(s))
+ return;
+
+ if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
+ mem_cgroup_destroy_cache(s);
+}
+
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
@@ -140,36 +151,9 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away. Since once
- * created a memcg's cache is destroyed only along with the root cache, it is
- * true if we are going to allocate from the cache or hold a reference to the
- * root cache by other means. Otherwise, we should hold either the slab_mutex
- * or the memcg's slab_caches_mutex while calling this function and accessing
- * the returned value.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
- struct kmem_cache *cachep;
- struct memcg_cache_params *params;
-
- if (!s->memcg_params)
- return NULL;
-
- rcu_read_lock();
- params = rcu_dereference(s->memcg_params);
- cachep = params->memcg_caches[idx];
- rcu_read_unlock();
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match this (see
- * memcg_register_cache()).
- */
- smp_read_barrier_depends();
- return cachep;
+ return s->memcg_params->memcg_caches[idx];
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
@@ -178,29 +162,24 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s;
return s->memcg_params->root_cache;
}
+#else
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return true;
+}
-static __always_inline int memcg_charge_slab(struct kmem_cache *s,
- gfp_t gfp, int order)
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
{
- if (!memcg_kmem_enabled())
- return 0;
- if (is_root_cache(s))
- return 0;
- return __memcg_charge_slab(s, gfp, order);
+ return true;
}
-static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
- if (!memcg_kmem_enabled())
- return;
- if (is_root_cache(s))
- return;
- __memcg_uncharge_slab(s, order);
}
-#else
-static inline bool is_root_cache(struct kmem_cache *s)
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
{
- return true;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -214,8 +193,7 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
return NULL;
}
@@ -224,15 +202,6 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
-
-static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
-{
- return 0;
-}
-
-static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
-}
#endif
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
@@ -261,38 +230,3 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
}
#endif
-
-
-/*
- * The slab lists for all objects.
- */
-struct kmem_cache_node {
- spinlock_t list_lock;
-
-#ifdef CONFIG_SLAB
- struct list_head slabs_partial; /* partial list first, better asm code */
- struct list_head slabs_full;
- struct list_head slabs_free;
- unsigned long free_objects;
- unsigned int free_limit;
- unsigned int colour_next; /* Per-node cache coloring */
- struct array_cache *shared; /* shared per node */
- struct array_cache **alien; /* on other nodes */
- unsigned long next_reap; /* updated without locking */
- int free_touched; /* updated without locking */
-#endif
-
-#ifdef CONFIG_SLUB
- unsigned long nr_partial;
- struct list_head partial;
-#ifdef CONFIG_SLUB_DEBUG
- atomic_long_t nr_slabs;
- atomic_long_t total_objects;
- struct list_head full;
-#endif
-#endif
-
-};
-
-void *slab_next(struct seq_file *m, void *p, loff_t *pos);
-void slab_stop(struct seq_file *m, void *p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 735e01a0db6..3f3cd97d3fd 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -19,7 +19,6 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
-#include <trace/events/kmem.h>
#include "slab.h"
@@ -29,7 +28,8 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(const char *name, size_t size)
+static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
+ size_t size)
{
struct kmem_cache *s = NULL;
@@ -55,22 +55,27 @@ static int kmem_cache_sanity_check(const char *name, size_t size)
continue;
}
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
- if (!strcmp(s->name, name)) {
+ /*
+ * For simplicity, we won't check this in the list of memcg
+ * caches. We have control over memcg naming, and if there
+ * aren't duplicates in the global list, there won't be any
+ * duplicates in the memcg lists as well.
+ */
+ if (!memcg && !strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
s = NULL;
return -EINVAL;
}
-#endif
}
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
return 0;
}
#else
-static inline int kmem_cache_sanity_check(const char *name, size_t size)
+static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
+ const char *name, size_t size)
{
return 0;
}
@@ -131,45 +136,6 @@ unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
-static struct kmem_cache *
-do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
-{
- struct kmem_cache *s;
- int err;
-
- err = -ENOMEM;
- s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (!s)
- goto out;
-
- s->name = name;
- s->object_size = object_size;
- s->size = size;
- s->align = align;
- s->ctor = ctor;
-
- err = memcg_alloc_cache_params(memcg, s, root_cache);
- if (err)
- goto out_free_cache;
-
- err = __kmem_cache_create(s, flags);
- if (err)
- goto out_free_cache;
-
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
-out:
- if (err)
- return ERR_PTR(err);
- return s;
-
-out_free_cache:
- memcg_free_cache_params(s);
- kfree(s);
- goto out;
-}
/*
* kmem_cache_create - Create a cache.
@@ -195,22 +161,20 @@ out_free_cache:
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
+
struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *),
+ struct kmem_cache *parent_cache)
{
- struct kmem_cache *s;
- char *cache_name;
- int err;
+ struct kmem_cache *s = NULL;
+ int err = 0;
get_online_cpus();
- get_online_mems();
-
mutex_lock(&slab_mutex);
- err = kmem_cache_sanity_check(name, size);
- if (err)
- goto out_unlock;
+ if (!kmem_cache_sanity_check(memcg, name, size) == 0)
+ goto out_locked;
/*
* Some allocators will constraint the set of valid flags to a subset
@@ -220,31 +184,47 @@ kmem_cache_create(const char *name, size_t size, size_t align,
*/
flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(name, size, align, flags, ctor);
+ s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
- goto out_unlock;
+ goto out_locked;
- cache_name = kstrdup(name, GFP_KERNEL);
- if (!cache_name) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (s) {
+ s->object_size = s->size = size;
+ s->align = calculate_alignment(flags, align, size);
+ s->ctor = ctor;
+
+ if (memcg_register_cache(memcg, s, parent_cache)) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
+ }
- s = do_kmem_cache_create(cache_name, size, size,
- calculate_alignment(flags, align, size),
- flags, ctor, NULL, NULL);
- if (IS_ERR(s)) {
- err = PTR_ERR(s);
- kfree(cache_name);
- }
+ s->name = kstrdup(name, GFP_KERNEL);
+ if (!s->name) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
+ }
-out_unlock:
- mutex_unlock(&slab_mutex);
+ err = __kmem_cache_create(s, flags);
+ if (!err) {
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ memcg_cache_list_add(memcg, s);
+ } else {
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ }
+ } else
+ err = -ENOMEM;
- put_online_mems();
+out_locked:
+ mutex_unlock(&slab_mutex);
put_online_cpus();
if (err) {
+
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
@@ -253,148 +233,54 @@ out_unlock:
name, err);
dump_stack();
}
- return NULL;
- }
- return s;
-}
-EXPORT_SYMBOL(kmem_cache_create);
-
-#ifdef CONFIG_MEMCG_KMEM
-/*
- * memcg_create_kmem_cache - Create a cache for a memory cgroup.
- * @memcg: The memory cgroup the new cache is for.
- * @root_cache: The parent of the new cache.
- * @memcg_name: The name of the memory cgroup (used for naming the new cache).
- *
- * This function attempts to create a kmem cache that will serve allocation
- * requests going from @memcg to @root_cache. The new cache inherits properties
- * from its parent.
- */
-struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache,
- const char *memcg_name)
-{
- struct kmem_cache *s = NULL;
- char *cache_name;
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-
- cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name);
- if (!cache_name)
- goto out_unlock;
-
- s = do_kmem_cache_create(cache_name, root_cache->object_size,
- root_cache->size, root_cache->align,
- root_cache->flags, root_cache->ctor,
- memcg, root_cache);
- if (IS_ERR(s)) {
- kfree(cache_name);
- s = NULL;
+ return NULL;
}
-out_unlock:
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-
return s;
}
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- int rc;
-
- if (!s->memcg_params ||
- !s->memcg_params->is_root_cache)
- return 0;
-
- mutex_unlock(&slab_mutex);
- rc = __memcg_cleanup_cache_params(s);
- mutex_lock(&slab_mutex);
-
- return rc;
-}
-#else
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- return 0;
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-void slab_kmem_cache_release(struct kmem_cache *s)
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
+ return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
+EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *s)
{
- get_online_cpus();
- get_online_mems();
+ /* Destroy all the children caches if we aren't a memcg cache */
+ kmem_cache_destroy_memcg_children(s);
+ get_online_cpus();
mutex_lock(&slab_mutex);
-
s->refcount--;
- if (s->refcount)
- goto out_unlock;
-
- if (memcg_cleanup_cache_params(s) != 0)
- goto out_unlock;
-
- if (__kmem_cache_shutdown(s) != 0) {
- printk(KERN_ERR "kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
- dump_stack();
- goto out_unlock;
+ if (!s->refcount) {
+ list_del(&s->list);
+
+ if (!__kmem_cache_shutdown(s)) {
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_release_cache(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ } else {
+ list_add(&s->list, &slab_caches);
+ mutex_unlock(&slab_mutex);
+ printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
+ dump_stack();
+ }
+ } else {
+ mutex_unlock(&slab_mutex);
}
-
- list_del(&s->list);
-
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_free_cache_params(s);
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_remove(s);
-#else
- slab_kmem_cache_release(s);
-#endif
- goto out;
-
-out_unlock:
- mutex_unlock(&slab_mutex);
-out:
- put_online_mems();
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
-{
- int ret;
-
- get_online_cpus();
- get_online_mems();
- ret = __kmem_cache_shrink(cachep);
- put_online_mems();
- put_online_cpus();
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_shrink);
-
int slab_is_available(void)
{
return slab_state >= UP;
@@ -413,7 +299,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
err = __kmem_cache_create(s, flags);
if (err)
- panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
+ panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
name, size, err);
s->refcount = -1; /* Exempt from merging for now */
@@ -433,218 +319,10 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
return s;
}
-struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
-EXPORT_SYMBOL(kmalloc_caches);
-
-#ifdef CONFIG_ZONE_DMA
-struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
-EXPORT_SYMBOL(kmalloc_dma_caches);
-#endif
-
-/*
- * Conversion table for small slabs sizes / 8 to the index in the
- * kmalloc array. This is necessary for slabs < 192 since we have non power
- * of two cache sizes there. The size of larger slabs can be determined using
- * fls.
- */
-static s8 size_index[24] = {
- 3, /* 8 */
- 4, /* 16 */
- 5, /* 24 */
- 5, /* 32 */
- 6, /* 40 */
- 6, /* 48 */
- 6, /* 56 */
- 6, /* 64 */
- 1, /* 72 */
- 1, /* 80 */
- 1, /* 88 */
- 1, /* 96 */
- 7, /* 104 */
- 7, /* 112 */
- 7, /* 120 */
- 7, /* 128 */
- 2, /* 136 */
- 2, /* 144 */
- 2, /* 152 */
- 2, /* 160 */
- 2, /* 168 */
- 2, /* 176 */
- 2, /* 184 */
- 2 /* 192 */
-};
-
-static inline int size_index_elem(size_t bytes)
-{
- return (bytes - 1) / 8;
-}
-
-/*
- * Find the kmem_cache structure that serves a given size of
- * allocation
- */
-struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
-{
- int index;
-
- if (unlikely(size > KMALLOC_MAX_SIZE)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- if (size <= 192) {
- if (!size)
- return ZERO_SIZE_PTR;
-
- index = size_index[size_index_elem(size)];
- } else
- index = fls(size - 1);
-
-#ifdef CONFIG_ZONE_DMA
- if (unlikely((flags & GFP_DMA)))
- return kmalloc_dma_caches[index];
-
-#endif
- return kmalloc_caches[index];
-}
-
-/*
- * Create the kmalloc array. Some of the regular kmalloc arrays
- * may already have been created because they were needed to
- * enable allocations for slab creation.
- */
-void __init create_kmalloc_caches(unsigned long flags)
-{
- int i;
-
- /*
- * Patch up the size_index table if we have strange large alignment
- * requirements for the kmalloc array. This is only the case for
- * MIPS it seems. The standard arches will not generate any code here.
- *
- * Largest permitted alignment is 256 bytes due to the way we
- * handle the index determination for the smaller caches.
- *
- * Make sure that nothing crazy happens if someone starts tinkering
- * around with ARCH_KMALLOC_MINALIGN
- */
- BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
- (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
-
- for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
- int elem = size_index_elem(i);
-
- if (elem >= ARRAY_SIZE(size_index))
- break;
- size_index[elem] = KMALLOC_SHIFT_LOW;
- }
-
- if (KMALLOC_MIN_SIZE >= 64) {
- /*
- * The 96 byte size cache is not used if the alignment
- * is 64 byte.
- */
- for (i = 64 + 8; i <= 96; i += 8)
- size_index[size_index_elem(i)] = 7;
-
- }
-
- if (KMALLOC_MIN_SIZE >= 128) {
- /*
- * The 192 byte sized cache is not used if the alignment
- * is 128 byte. Redirect kmalloc to use the 256 byte cache
- * instead.
- */
- for (i = 128 + 8; i <= 192; i += 8)
- size_index[size_index_elem(i)] = 8;
- }
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i]) {
- kmalloc_caches[i] = create_kmalloc_cache(NULL,
- 1 << i, flags);
- }
-
- /*
- * Caches that are not of the two-to-the-power-of size.
- * These have to be created immediately after the
- * earlier power of two caches
- */
- if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
-
- if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
- }
-
- /* Kmalloc array is now usable */
- slab_state = UP;
-
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
- char *n;
-
- if (s) {
- n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
-
- BUG_ON(!n);
- s->name = n;
- }
- }
-
-#ifdef CONFIG_ZONE_DMA
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
-
- if (s) {
- int size = kmalloc_size(i);
- char *n = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%d", size);
-
- BUG_ON(!n);
- kmalloc_dma_caches[i] = create_kmalloc_cache(n,
- size, SLAB_CACHE_DMA | flags);
- }
- }
-#endif
-}
#endif /* !CONFIG_SLOB */
-/*
- * To avoid unnecessary overhead, we pass through large allocation requests
- * directly to the page allocator. We use __GFP_COMP, because we will need to
- * know the allocation order to free the pages properly in kfree.
- */
-void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
-{
- void *ret;
- struct page *page;
-
- flags |= __GFP_COMP;
- page = alloc_kmem_pages(flags, order);
- ret = page ? page_address(page) : NULL;
- kmemleak_alloc(ret, size, 1, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmalloc_order);
-
-#ifdef CONFIG_TRACING
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
-{
- void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmalloc_order_trace);
-#endif
#ifdef CONFIG_SLABINFO
-
-#ifdef CONFIG_SLAB
-#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
-#else
-#define SLABINFO_RIGHTS S_IRUSR
-#endif
-
void print_slabinfo_header(struct seq_file *m)
{
/*
@@ -679,12 +357,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
return seq_list_start(&slab_caches, *pos);
}
-void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
-void slab_stop(struct seq_file *m, void *p)
+static void s_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
@@ -700,7 +378,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
return;
for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
+ c = cache_from_memcg(s, i);
if (!c)
continue;
@@ -761,8 +439,8 @@ static int s_show(struct seq_file *m, void *p)
*/
static const struct seq_operations slabinfo_op = {
.start = s_start,
- .next = slab_next,
- .stop = slab_stop,
+ .next = s_next,
+ .stop = s_stop,
.show = s_show,
};
@@ -781,8 +459,7 @@ static const struct file_operations proc_slabinfo_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
- &proc_slabinfo_operations);
+ proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a..a99fdf7a090 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -111,18 +111,18 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->lru, list);
+ list_add(&sp->list, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->lru);
+ list_del(&sp->list);
__ClearPageSlobFree(sp);
}
#define SLOB_UNIT sizeof(slob_t)
-#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
+#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, lru) {
+ list_for_each_entry(sp, slob_list, list) {
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
continue;
/* Attempt to alloc */
- prev = sp->lru.prev;
+ prev = sp->list.prev;
b = slob_page_alloc(sp, size, align);
if (!b)
continue;
@@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->lru);
+ INIT_LIST_HEAD(&sp->list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
b = slob_page_alloc(sp, size, align);
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
clear_slob_page_free(sp);
spin_unlock_irqrestore(&slob_lock, flags);
__ClearPageSlab(sp);
- page_mapcount_reset(sp);
+ reset_page_mapcount(sp);
slob_free_pages(b, 0);
return;
}
@@ -462,11 +462,11 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
return ret;
}
-void *__kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
+ return __do_kmalloc_node(size, gfp, node, _RET_IP_);
}
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_node);
#ifdef CONFIG_TRACING
void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
@@ -534,7 +534,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
return 0;
}
-void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
@@ -554,33 +554,13 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags, node);
}
- if (b && c->ctor)
+ if (c->ctor)
c->ctor(b);
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
}
-EXPORT_SYMBOL(slob_alloc_node);
-
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
- return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
-{
- return __do_kmalloc_node(size, gfp, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
-{
- return slob_alloc_node(cachep, gfp, node);
-}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
static void __kmem_cache_free(void *b, int size)
{
@@ -620,10 +600,11 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int __kmem_cache_shrink(struct kmem_cache *d)
+int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
+EXPORT_SYMBOL(kmem_cache_shrink);
struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 73004808537..ba2ca53f6c3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -18,7 +18,6 @@
#include <linux/slab.h>
#include "slab.h"
#include <linux/proc_fs.h>
-#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
@@ -123,15 +122,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- return !kmem_cache_debug(s);
-#else
- return false;
-#endif
-}
-
/*
* Issues still to be resolved:
*
@@ -155,7 +145,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/*
* Maximum number of desirable partial slabs.
* The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in use.
+ * sort the partial list by the number of objects in the.
*/
#define MAX_PARTIAL 10
@@ -210,22 +200,21 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
+static void sysfs_slab_remove(struct kmem_cache *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
+
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- /*
- * The rmw is racy on a preemptible kernel but this is acceptable, so
- * avoid this_cpu_add()'s irq-disable overhead.
- */
- raw_cpu_inc(s->cpu_slab->stat[si]);
+ __this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -356,21 +345,6 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
-{
- struct page tmp;
- tmp.counters = counters_new;
- /*
- * page->counters can cover frozen/inuse/objects as well
- * as page->_count. If we assign to ->counters directly
- * we run the risk of losing updates to page->_count, so
- * be careful and only assign to the fields we need.
- */
- page->frozen = tmp.frozen;
- page->inuse = tmp.inuse;
- page->objects = tmp.objects;
-}
-
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -389,10 +363,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
#endif
{
slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
+ if (page->freelist == freelist_old && page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
return 1;
}
@@ -403,7 +376,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
- pr_info("%s %s: cmpxchg double redo ", n, s->name);
+ printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
@@ -428,10 +401,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
local_irq_save(flags);
slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
+ if (page->freelist == freelist_old && page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
local_irq_restore(flags);
return 1;
@@ -444,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
- pr_info("%s %s: cmpxchg double redo ", n, s->name);
+ printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
@@ -546,14 +518,14 @@ static void print_track(const char *s, struct track *t)
if (!t->addr)
return;
- pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
- s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+ printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
int i;
for (i = 0; i < TRACK_ADDRS_COUNT; i++)
if (t->addrs[i])
- pr_err("\t%pS\n", (void *)t->addrs[i]);
+ printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
else
break;
}
@@ -571,37 +543,37 @@ static void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
- page, page->objects, page->inuse, page->freelist, page->flags);
+ printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ page, page->objects, page->inuse, page->freelist, page->flags);
}
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
- struct va_format vaf;
va_list args;
+ char buf[100];
va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
- pr_err("-----------------------------------------------------------------------------\n\n");
-
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ printk(KERN_ERR "========================================"
+ "=====================================\n");
+ printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
+ printk(KERN_ERR "----------------------------------------"
+ "-------------------------------------\n\n");
+
+ add_taint(TAINT_BAD_PAGE);
}
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
- struct va_format vaf;
va_list args;
+ char buf[100];
va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- pr_err("FIX %s: %pV\n", s->name, &vaf);
+ vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
}
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
@@ -613,8 +585,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_page_info(page);
- pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
- p, p - addr, get_freepointer(s, p));
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
@@ -647,8 +619,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page,
- const char *fmt, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
{
va_list args;
char buf[100];
@@ -697,7 +668,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
end--;
slab_bug(s, "%s overwritten", what);
- pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault[0], value);
print_trailer(s, page, object);
@@ -807,8 +778,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
} else {
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
check_bytes_and_report(s, page, p, "Alignment padding",
- endobject, POISON_INUSE,
- s->inuse - s->object_size);
+ endobject, POISON_INUSE, s->inuse - s->object_size);
}
}
@@ -893,6 +863,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
object_err(s, page, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, page, "Freepointer corrupt");
page->freelist = NULL;
@@ -930,15 +901,14 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
int alloc)
{
if (s->flags & SLAB_TRACE) {
- pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
s->name,
alloc ? "alloc" : "free",
object, page->inuse,
page->freelist);
if (!alloc)
- print_section("Object ", (void *)object,
- s->object_size);
+ print_section("Object ", (void *)object, s->object_size);
dump_stack();
}
@@ -948,16 +918,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- kmemleak_alloc(ptr, size, 1, flags);
-}
-
-static inline void kfree_hook(const void *x)
-{
- kmemleak_free(x);
-}
-
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -967,8 +927,7 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
return should_failslab(s->object_size, flags, s->flags);
}
-static inline void slab_post_alloc_hook(struct kmem_cache *s,
- gfp_t flags, void *object)
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
flags &= gfp_allowed_mask;
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
@@ -980,7 +939,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kmemleak_free_recursive(x, s->flags);
/*
- * Trouble is that we may no longer disable interrupts in the fast path
+ * Trouble is that we may no longer disable interupts in the fast path
* So in order to make the debug calls that expect irqs to be
* disabled we need to disable interrupts temporarily.
*/
@@ -1000,6 +959,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
/*
* Tracking of fully allocated slabs for debugging purposes.
+ *
+ * list_lock must be held.
*/
static void add_full(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page)
@@ -1007,16 +968,17 @@ static void add_full(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
return;
- lockdep_assert_held(&n->list_lock);
list_add(&page->lru, &n->full);
}
-static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
+/*
+ * list_lock must be held.
+ */
+static void remove_full(struct kmem_cache *s, struct page *page)
{
if (!(s->flags & SLAB_STORE_USER))
return;
- lockdep_assert_held(&n->list_lock);
list_del(&page->lru);
}
@@ -1043,7 +1005,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
* dilemma by deferring the increment of the count during
* bootstrap (see early_kmem_cache_node_alloc).
*/
- if (likely(n)) {
+ if (n) {
atomic_long_inc(&n->nr_slabs);
atomic_long_add(objects, &n->total_objects);
}
@@ -1067,8 +1029,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page,
+static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
@@ -1133,8 +1094,9 @@ static noinline struct kmem_cache_node *free_debug_processing(
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
} else if (!page->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
+ printk(KERN_ERR
+ "SLUB <none>: no slab for object 0x%p.\n",
+ object);
dump_stack();
} else
object_err(s, page, object,
@@ -1217,8 +1179,8 @@ static int __init setup_slub_debug(char *str)
slub_debug |= SLAB_FAILSLAB;
break;
default:
- pr_err("slub_debug option '%c' unknown. skipped\n",
- *str);
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n", *str);
}
}
@@ -1238,8 +1200,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
/*
* Enable debugging if selected on the kernel commandline.
*/
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
+ if (slub_debug && (!slub_debug_slabs ||
+ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
flags |= slub_debug;
return flags;
@@ -1261,8 +1223,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
-static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
@@ -1282,56 +1243,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- kmemleak_alloc(ptr, size, 1, flags);
-}
-
-static inline void kfree_hook(const void *x)
-{
- kmemleak_free(x);
-}
-
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{ return 0; }
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- void *object)
-{
- kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
- flags & gfp_allowed_mask);
-}
+ void *object) {}
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
-{
- kmemleak_free_recursive(x, s->flags);
-}
+static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
#endif /* CONFIG_SLUB_DEBUG */
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(struct kmem_cache *s,
- gfp_t flags, int node, struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
{
- struct page *page;
int order = oo_order(oo);
flags |= __GFP_NOTRACK;
- if (memcg_charge_slab(s, flags, order))
- return NULL;
-
if (node == NUMA_NO_NODE)
- page = alloc_pages(flags, order);
+ return alloc_pages(flags, order);
else
- page = alloc_pages_exact_node(node, flags, order);
-
- if (!page)
- memcg_uncharge_slab(s, order);
-
- return page;
+ return alloc_pages_exact_node(node, flags, order);
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1353,15 +1288,14 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- page = alloc_slab_page(s, alloc_gfp, node, oo);
+ page = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!page)) {
oo = s->min;
- alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(s, alloc_gfp, node, oo);
+ page = alloc_slab_page(flags, node, oo);
if (page)
stat(s, ORDER_FALLBACK);
@@ -1371,7 +1305,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
- kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
+ kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
/*
* Objects from caches that have a constructor don't get
@@ -1422,6 +1356,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
order = compound_order(page);
inc_slabs_node(s, page_to_nid(page), page->objects);
+ memcg_bind_pages(s, order);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
@@ -1472,11 +1407,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
- page_mapcount_reset(page);
+ memcg_release_pages(s, order);
+ reset_page_mapcount(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_pages(page, order);
- memcg_uncharge_slab(s, order);
+ __free_memcg_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -1525,9 +1460,11 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
/*
* Management of partially allocated slabs.
+ *
+ * list_lock must be held.
*/
-static inline void
-__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+static inline void add_partial(struct kmem_cache_node *n,
+ struct page *page, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
@@ -1536,25 +1473,14 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
list_add(&page->lru, &n->partial);
}
-static inline void add_partial(struct kmem_cache_node *n,
- struct page *page, int tail)
-{
- lockdep_assert_held(&n->list_lock);
- __add_partial(n, page, tail);
-}
-
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
- list_del(&page->lru);
- n->nr_partial--;
-}
-
+/*
+ * list_lock must be held.
+ */
static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
- lockdep_assert_held(&n->list_lock);
- __remove_partial(n, page);
+ list_del(&page->lru);
+ n->nr_partial--;
}
/*
@@ -1562,17 +1488,17 @@ static inline void remove_partial(struct kmem_cache_node *n,
* return the pointer to the freelist.
*
* Returns a list of objects or NULL if it fails.
+ *
+ * Must hold list_lock since we modify the partial list.
*/
static inline void *acquire_slab(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page,
- int mode, int *objects)
+ int mode)
{
void *freelist;
unsigned long counters;
struct page new;
- lockdep_assert_held(&n->list_lock);
-
/*
* Zap the freelist and set the frozen bit.
* The old freelist is the list of objects for the
@@ -1581,7 +1507,6 @@ static inline void *acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- *objects = new.objects - new.inuse;
if (mode) {
new.inuse = page->objects;
new.freelist = NULL;
@@ -1603,7 +1528,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
return freelist;
}
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
/*
@@ -1614,8 +1539,6 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
{
struct page *page, *page2;
void *object = NULL;
- int available = 0;
- int objects;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -1629,25 +1552,25 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
spin_lock(&n->list_lock);
list_for_each_entry_safe(page, page2, &n->partial, lru) {
void *t;
+ int available;
if (!pfmemalloc_match(page, flags))
continue;
- t = acquire_slab(s, n, page, object == NULL, &objects);
+ t = acquire_slab(s, n, page, object == NULL);
if (!t)
break;
- available += objects;
if (!object) {
c->page = page;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
+ available = page->objects - page->inuse;
} else {
- put_cpu_partial(s, page, 0);
+ available = put_cpu_partial(s, page, 0);
stat(s, CPU_PARTIAL_NODE);
}
- if (!kmem_cache_has_cpu_partial(s)
- || available > s->cpu_partial / 2)
+ if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
break;
}
@@ -1692,8 +1615,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
return NULL;
do {
- cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1704,17 +1627,19 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
object = get_partial_node(s, n, c, flags);
if (object) {
/*
- * Don't check read_mems_allowed_retry()
- * here - if mems_allowed was updated in
- * parallel, that was a harmless race
- * between allocation and the cpuset
- * update
+ * Return the object even if
+ * put_mems_allowed indicated that
+ * the cpuset mems_allowed was
+ * updated in parallel. It's a
+ * harmless race between the alloc
+ * and the cpuset update.
*/
+ put_mems_allowed(cpuset_mems_cookie);
return object;
}
}
}
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
+ } while (!put_mems_allowed(cpuset_mems_cookie));
#endif
return NULL;
}
@@ -1726,7 +1651,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
struct kmem_cache_cpu *c)
{
void *object;
- int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;
+ int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -1776,19 +1701,19 @@ static inline void note_cmpxchg_failure(const char *n,
#ifdef SLUB_DEBUG_CMPXCHG
unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
- pr_info("%s %s: cmpxchg redo ", n, s->name);
+ printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
#ifdef CONFIG_PREEMPT
if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
- pr_warn("due to cpu change %d -> %d\n",
+ printk("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
else
#endif
if (tid_to_event(tid) != tid_to_event(actual_tid))
- pr_warn("due to cpu running other code. Event %ld->%ld\n",
+ printk("due to cpu running other code. Event %ld->%ld\n",
tid_to_event(tid), tid_to_event(actual_tid));
else
- pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
actual_tid, tid, next_tid(tid));
#endif
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
@@ -1805,8 +1730,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
/*
* Remove the cpu slab
*/
-static void deactivate_slab(struct kmem_cache *s, struct page *page,
- void *freelist)
+static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1881,7 +1805,7 @@ redo:
new.frozen = 0;
- if (!new.inuse && n->nr_partial >= s->min_partial)
+ if (!new.inuse && n->nr_partial > s->min_partial)
m = M_FREE;
else if (new.freelist) {
m = M_PARTIAL;
@@ -1915,7 +1839,7 @@ redo:
else if (l == M_FULL)
- remove_full(s, n, page);
+ remove_full(s, page);
if (m == M_PARTIAL) {
@@ -1957,7 +1881,6 @@ redo:
static void unfreeze_partials(struct kmem_cache *s,
struct kmem_cache_cpu *c)
{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
struct kmem_cache_node *n = NULL, *n2 = NULL;
struct page *page, *discard_page = NULL;
@@ -1992,7 +1915,7 @@ static void unfreeze_partials(struct kmem_cache *s,
new.freelist, new.counters,
"unfreezing slab"));
- if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
+ if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
page->next = discard_page;
discard_page = page;
} else {
@@ -2012,7 +1935,6 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
}
/*
@@ -2024,9 +1946,8 @@ static void unfreeze_partials(struct kmem_cache *s,
* If we did not find a slot then simply move all the partials to the
* per node partial list.
*/
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
struct page *oldpage;
int pages;
int pobjects;
@@ -2062,9 +1983,8 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
page->pobjects = pobjects;
page->next = oldpage;
- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
- != oldpage);
-#endif
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ return pobjects;
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2121,25 +2041,17 @@ static void flush_all(struct kmem_cache *s)
static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
- if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
return 0;
#endif
return 1;
}
-#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct page *page)
{
return page->objects - page->inuse;
}
-static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
-{
- return atomic_long_read(&n->total_objects);
-}
-#endif /* CONFIG_SLUB_DEBUG */
-
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
{
@@ -2153,28 +2065,31 @@ static unsigned long count_partial(struct kmem_cache_node *n,
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+
+static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ return atomic_long_read(&n->total_objects);
+#else
+ return 0;
+#endif
+}
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
-#ifdef CONFIG_SLUB_DEBUG
- static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
int node;
- if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
- return;
-
- pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+ printk(KERN_WARNING
+ "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
nid, gfpflags);
- pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
- s->name, s->object_size, s->size, oo_order(s->oo),
- oo_order(s->min));
+ printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
+ "default order: %d, min order: %d\n", s->name, s->object_size,
+ s->size, oo_order(s->oo), oo_order(s->min));
if (oo_order(s->min) > get_order(s->object_size))
- pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
- s->name);
+ printk(KERN_WARNING " %s debugging increased min order, use "
+ "slub_debug=O to disable.\n", s->name);
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -2189,10 +2104,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
nr_slabs = node_nr_slabs(n);
nr_objs = node_nr_objs(n);
- pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
+ printk(KERN_WARNING
+ " node %d: slabs: %ld, objs: %ld, free: %ld\n",
node, nr_slabs, nr_objs, nr_free);
}
-#endif
}
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
@@ -2209,7 +2124,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
page = new_slab(s, flags, node);
if (page) {
- c = raw_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu_slab);
if (c->page)
flush_slab(s, c);
@@ -2238,8 +2153,8 @@ static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
}
/*
- * Check the page->freelist of a page and either transfer the freelist to the
- * per cpu freelist or deactivate the page.
+ * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+ * or deactivate the page.
*
* The page is still frozen if the return value is not NULL.
*
@@ -2334,6 +2249,8 @@ redo:
if (freelist)
goto load_freelist;
+ stat(s, ALLOC_SLOWPATH);
+
freelist = get_freelist(s, page);
if (!freelist) {
@@ -2369,7 +2286,9 @@ new_slab:
freelist = new_slab_objects(s, gfpflags, node, &c);
if (unlikely(!freelist)) {
- slab_out_of_memory(s, gfpflags, node);
+ if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(s, gfpflags, node);
+
local_irq_restore(flags);
return NULL;
}
@@ -2379,8 +2298,7 @@ new_slab:
goto load_freelist;
/* Only entered in the debug case */
- if (kmem_cache_debug(s) &&
- !alloc_debug_processing(s, page, freelist, addr))
+ if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist));
@@ -2413,19 +2331,14 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
s = memcg_kmem_get_cache(s, gfpflags);
redo:
+
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
* enabled. We may switch back and forth between cpus while
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
- *
- * Preemption is disabled for the retrieval of the tid because that
- * must occur from the current processor. We cannot allow rescheduling
- * on a different processor between the determination of the pointer
- * and the retrieval of the tid.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu_slab);
/*
* The transaction ids are globally unique per cpu and per operation on
@@ -2434,29 +2347,27 @@ redo:
* linked list in between.
*/
tid = c->tid;
- preempt_enable();
+ barrier();
object = c->freelist;
page = c->page;
- if (unlikely(!object || !node_match(page, node))) {
+ if (unlikely(!object || !node_match(page, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
- stat(s, ALLOC_SLOWPATH);
- } else {
+
+ else {
void *next_object = get_freepointer_safe(s, object);
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
*
- * The cmpxchg does the following atomically (without lock
- * semantics!)
+ * The cmpxchg does the following atomically (without lock semantics!)
* 1. Relocate first pointer to the current per cpu area.
* 2. Verify that tid and freelist have not been changed
* 3. If they were not changed replace tid and freelist
*
- * Since this is without lock semantics the protection is only
- * against code executing on this cpu *not* from access by
- * other cpus.
+ * Since this is without lock semantics the protection is only against
+ * code executing on this cpu *not* from access by other cpus.
*/
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
@@ -2488,8 +2399,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
- s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
return ret;
}
@@ -2503,6 +2413,14 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
#endif
#ifdef CONFIG_NUMA
@@ -2570,17 +2488,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen) {
- if (kmem_cache_has_cpu_partial(s) && !prior) {
+ if (!kmem_cache_debug(s) && !prior)
/*
- * Slab was on no list before and will be
- * partially empty
- * We can defer the list move and instead
- * freeze it.
+ * Slab was on no list before and will be partially empty
+ * We can defer the list move and instead freeze it.
*/
new.frozen = 1;
- } else { /* Needs to be taken off a list */
+ else { /* Needs to be taken off a list */
n = get_node(s, page_to_nid(page));
/*
@@ -2620,16 +2536,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
}
- if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
+ if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
goto slab_empty;
/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
- if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ if (kmem_cache_debug(s) && unlikely(!prior)) {
+ remove_full(s, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -2643,10 +2558,9 @@ slab_empty:
*/
remove_partial(n, page);
stat(s, FREE_REMOVE_PARTIAL);
- } else {
+ } else
/* Slab must be on the full list */
- remove_full(s, n, page);
- }
+ remove_full(s, page);
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
@@ -2680,11 +2594,10 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succedd.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu_slab);
tid = c->tid;
- preempt_enable();
+ barrier();
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
@@ -2862,7 +2775,7 @@ init_kmem_cache_node(struct kmem_cache_node *n)
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
- KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+ SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
/*
* Must align to double word boundary for the double cmpxchg
@@ -2886,8 +2799,8 @@ static struct kmem_cache *kmem_cache_node;
* slab on the node for this slabcache. There are no concurrent accesses
* possible.
*
- * Note that this function only works on the kmem_cache_node
- * when allocating for the kmem_cache_node. This is used for bootstrapping
+ * Note that this function only works on the kmalloc_node_cache
+ * when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
static void early_kmem_cache_node_alloc(int node)
@@ -2901,8 +2814,10 @@ static void early_kmem_cache_node_alloc(int node)
BUG_ON(!page);
if (page_to_nid(page) != node) {
- pr_err("SLUB: Unable to allocate memory from node %d\n", node);
- pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
+ printk(KERN_ERR "SLUB: Unable to allocate memory from "
+ "node %d\n", node);
+ printk(KERN_ERR "SLUB: Allocating a useless per node structure "
+ "in order to be able to continue\n");
}
n = page->freelist;
@@ -2918,11 +2833,7 @@ static void early_kmem_cache_node_alloc(int node)
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
- /*
- * No locks need to be taken here as it has just been
- * initialized and there is no concurrent access.
- */
- __add_partial(n, page, DEACTIVATE_TO_HEAD);
+ add_partial(n, page, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3071,7 +2982,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
s->allocflags |= __GFP_COMP;
if (s->flags & SLAB_CACHE_DMA)
- s->allocflags |= GFP_DMA;
+ s->allocflags |= SLUB_DMA;
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
@@ -3137,10 +3048,10 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
* A) The number of objects from per cpu partial slabs dumped to the
* per node list when we reach the limit.
* B) The number of objects in cpu partial slabs to extract from the
- * per node list when we run out of per cpu objects. We only fetch
- * 50% to keep some capacity around for frees.
+ * per node list when we run out of per cpu objects. We only fetch 50%
+ * to keep some capacity around for frees.
*/
- if (!kmem_cache_has_cpu_partial(s))
+ if (kmem_cache_debug(s))
s->cpu_partial = 0;
else if (s->size >= PAGE_SIZE)
s->cpu_partial = 2;
@@ -3165,8 +3076,8 @@ error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u "
"order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
+ s->offset, flags);
return -EINVAL;
}
@@ -3187,7 +3098,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
- pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
+ p, p - addr);
print_tracking(s, p);
}
}
@@ -3207,7 +3119,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
@@ -3239,13 +3151,36 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int __kmem_cache_shutdown(struct kmem_cache *s)
{
- return kmem_cache_close(s);
+ int rc = kmem_cache_close(s);
+
+ if (!rc) {
+ /*
+ * We do the same lock strategy around sysfs_slab_add, see
+ * __kmem_cache_create. Because this is pretty much the last
+ * operation we do and the lock will be released shortly after
+ * that in slab_common.c, we could just move sysfs_slab_remove
+ * to a later point in common code. We should do that when we
+ * have a common sysfs framework for all allocators.
+ */
+ mutex_unlock(&slab_mutex);
+ sysfs_slab_remove(s);
+ mutex_lock(&slab_mutex);
+ }
+
+ return rc;
}
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
+struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
+EXPORT_SYMBOL(kmalloc_caches);
+
+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+#endif
+
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -3282,15 +3217,73 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static s8 size_index[24] = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static inline int size_index_elem(size_t bytes)
+{
+ return (bytes - 1) / 8;
+}
+
+static struct kmem_cache *get_slab(size_t size, gfp_t flags)
+{
+ int index;
+
+ if (size <= 192) {
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ index = size_index[size_index_elem(size)];
+ } else
+ index = fls(size - 1);
+
+#ifdef CONFIG_ZONE_DMA
+ if (unlikely((flags & SLUB_DMA)))
+ return kmalloc_dma_caches[index];
+
+#endif
+ return kmalloc_caches[index];
+}
+
void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
void *ret;
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ if (unlikely(size > SLUB_MAX_SIZE))
return kmalloc_large(size, flags);
- s = kmalloc_slab(size, flags);
+ s = get_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
@@ -3309,12 +3302,12 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK;
- page = alloc_kmem_pages_node(node, flags, get_order(size));
+ flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
+ page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
- kmalloc_large_node_hook(ptr, size, flags);
+ kmemleak_alloc(ptr, size, 1, flags);
return ptr;
}
@@ -3323,7 +3316,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
struct kmem_cache *s;
void *ret;
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ if (unlikely(size > SLUB_MAX_SIZE)) {
ret = kmalloc_large_node(size, flags, node);
trace_kmalloc_node(_RET_IP_, ret,
@@ -3333,7 +3326,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return ret;
}
- s = kmalloc_slab(size, flags);
+ s = get_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
@@ -3365,6 +3358,42 @@ size_t ksize(const void *object)
}
EXPORT_SYMBOL(ksize);
+#ifdef CONFIG_SLUB_DEBUG
+bool verify_mem_not_deleted(const void *x)
+{
+ struct page *page;
+ void *object = (void *)x;
+ unsigned long flags;
+ bool rv;
+
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return false;
+
+ local_irq_save(flags);
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
+ /* maybe it was from stack? */
+ rv = true;
+ goto out_unlock;
+ }
+
+ slab_lock(page);
+ if (on_freelist(page->slab_cache, page, object)) {
+ object_err(page->slab_cache, page, object, "Object is on free-list");
+ rv = false;
+ } else {
+ rv = true;
+ }
+ slab_unlock(page);
+
+out_unlock:
+ local_irq_restore(flags);
+ return rv;
+}
+EXPORT_SYMBOL(verify_mem_not_deleted);
+#endif
+
void kfree(const void *x)
{
struct page *page;
@@ -3378,8 +3407,8 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
- kfree_hook(x);
- __free_kmem_pages(page, compound_order(page));
+ kmemleak_free(x);
+ __free_memcg_kmem_pages(page, compound_order(page));
return;
}
slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3396,7 +3425,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3452,14 +3481,16 @@ int __kmem_cache_shrink(struct kmem_cache *s)
kfree(slabs_by_inuse);
return 0;
}
+EXPORT_SYMBOL(kmem_cache_shrink);
+#if defined(CONFIG_MEMORY_HOTPLUG)
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);
return 0;
@@ -3567,10 +3598,7 @@ static int slab_memory_callback(struct notifier_block *self,
return ret;
}
-static struct notifier_block slab_memory_callback_nb = {
- .notifier_call = slab_memory_callback,
- .priority = SLAB_CALLBACK_PRI,
-};
+#endif /* CONFIG_MEMORY_HOTPLUG */
/********************************************************************
* Basic setup of slabs
@@ -3589,12 +3617,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
memcpy(s, static_cache, kmem_cache->object_size);
- /*
- * This runs very early, and only the boot processor is supposed to be
- * up. Even if it weren't true, IRQs are not up so we couldn't fire
- * IPIs around.
- */
- __flush_cpu_slab(s, smp_processor_id());
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
struct page *p;
@@ -3617,6 +3639,8 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int i;
+ int caches = 2;
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -3627,7 +3651,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
- register_hotmemory_notifier(&slab_memory_callback_nb);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
@@ -3647,14 +3671,103 @@ void __init kmem_cache_init(void)
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
- create_kmalloc_caches(0);
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+ int elem = size_index_elem(i);
+ if (elem >= ARRAY_SIZE(size_index))
+ break;
+ size_index[elem] = KMALLOC_SHIFT_LOW;
+ }
+
+ if (KMALLOC_MIN_SIZE == 64) {
+ /*
+ * The 96 byte size cache is not used if the alignment
+ * is 64 byte.
+ */
+ for (i = 64 + 8; i <= 96; i += 8)
+ size_index[size_index_elem(i)] = 7;
+ } else if (KMALLOC_MIN_SIZE == 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[size_index_elem(i)] = 8;
+ }
+
+ /* Caches that are not of the two-to-the-power-of size */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
+ caches++;
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
+ caches++;
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
+ caches++;
+ }
+
+ slab_state = UP;
+
+ /* Provide the correct kmalloc names now that the caches are up */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[1]->name);
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[2]->name);
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+
+ BUG_ON(!s);
+ kmalloc_caches[i]->name = s;
+ }
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
#endif
- pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
- cache_line_size(),
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+
+ if (s && s->size) {
+ char *name = kasprintf(GFP_NOWAIT,
+ "dma-kmalloc-%d", s->object_size);
+
+ BUG_ON(!name);
+ kmalloc_dma_caches[i] = create_kmalloc_cache(name,
+ s->object_size, SLAB_CACHE_DMA);
+ }
+ }
+#endif
+ printk(KERN_INFO
+ "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
+ " CPUs=%d, Nodes=%d\n",
+ caches, cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
}
@@ -3671,9 +3784,6 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
- if (!is_root_cache(s))
- return 1;
-
if (s->ctor)
return 1;
@@ -3686,8 +3796,9 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(size_t size, size_t align,
- unsigned long flags, const char *name, void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
+ size_t align, unsigned long flags, const char *name,
+ void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3710,7 +3821,7 @@ static struct kmem_cache *find_mergeable(size_t size, size_t align,
continue;
if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- continue;
+ continue;
/*
* Check if alignment is compatible.
* Courtesy of Adrian Drzewiecki
@@ -3721,24 +3832,23 @@ static struct kmem_cache *find_mergeable(size_t size, size_t align,
if (s->size - size >= sizeof(void *))
continue;
+ if (!cache_match_memcg(s, memcg))
+ continue;
+
return s;
}
return NULL;
}
struct kmem_cache *
-__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- s = find_mergeable(size, align, flags, name, ctor);
+ s = find_mergeable(memcg, size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
-
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
@@ -3746,15 +3856,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
- c->object_size = s->object_size;
- c->inuse = max_t(int, c->inuse,
- ALIGN(size, sizeof(void *)));
- }
-
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -3777,7 +3878,10 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
return 0;
memcg_propagate_slab_attrs(s);
+ mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
+ mutex_lock(&slab_mutex);
+
if (err)
kmem_cache_close(s);
@@ -3789,7 +3893,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
* Use the cpu notifier to insure that the cpu slabs are flushed when
* necessary.
*/
-static int slab_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -3815,7 +3919,7 @@ static int slab_cpuup_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block slab_notifier = {
+static struct notifier_block __cpuinitdata slab_notifier = {
.notifier_call = slab_cpuup_callback
};
@@ -3826,10 +3930,10 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
struct kmem_cache *s;
void *ret;
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ if (unlikely(size > SLUB_MAX_SIZE))
return kmalloc_large(size, gfpflags);
- s = kmalloc_slab(size, gfpflags);
+ s = get_slab(size, gfpflags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
@@ -3849,7 +3953,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
struct kmem_cache *s;
void *ret;
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ if (unlikely(size > SLUB_MAX_SIZE)) {
ret = kmalloc_large_node(size, gfpflags, node);
trace_kmalloc_node(caller, ret,
@@ -3859,7 +3963,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
- s = kmalloc_slab(size, gfpflags);
+ s = get_slab(size, gfpflags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
@@ -3935,8 +4039,8 @@ static int validate_slab_node(struct kmem_cache *s,
count++;
}
if (count != n->nr_partial)
- pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
- s->name, count, n->nr_partial);
+ printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
+ "counter=%ld\n", s->name, count, n->nr_partial);
if (!(s->flags & SLAB_STORE_USER))
goto out;
@@ -3946,8 +4050,9 @@ static int validate_slab_node(struct kmem_cache *s,
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
- pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
- s->name, count, atomic_long_read(&n->nr_slabs));
+ printk(KERN_ERR "SLUB: %s %ld slabs counted but "
+ "counter=%ld\n", s->name, count,
+ atomic_long_read(&n->nr_slabs));
out:
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4180,17 +4285,15 @@ static int list_locations(struct kmem_cache *s, char *buf,
!cpumask_empty(to_cpumask(l->cpus)) &&
len < PAGE_SIZE - 60) {
len += sprintf(buf + len, " cpus=");
- len += cpulist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
+ len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
to_cpumask(l->cpus));
}
if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
len < PAGE_SIZE - 60) {
len += sprintf(buf + len, " nodes=");
- len += nodelist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- l->nodes);
+ len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
+ l->nodes);
}
len += sprintf(buf + len, "\n");
@@ -4209,52 +4312,55 @@ static void resiliency_test(void)
{
u8 *p;
- BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
- pr_err("SLUB resiliency testing\n");
- pr_err("-----------------------\n");
- pr_err("A. Corruption after allocation\n");
+ printk(KERN_ERR "SLUB resiliency testing\n");
+ printk(KERN_ERR "-----------------------\n");
+ printk(KERN_ERR "A. Corruption after allocation\n");
p = kzalloc(16, GFP_KERNEL);
p[16] = 0x12;
- pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
- p + 16);
+ printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
+ " 0x12->0x%p\n\n", p + 16);
validate_slab_cache(kmalloc_caches[4]);
/* Hmmm... The next two are dangerous */
p = kzalloc(32, GFP_KERNEL);
p[32 + sizeof(void *)] = 0x34;
- pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
+ printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches[5]);
p = kzalloc(64, GFP_KERNEL);
p += 64 + (get_cycles() & 0xff) * sizeof(void *);
*p = 0x56;
- pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
+ printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches[6]);
- pr_err("\nB. Corruption after free\n");
+ printk(KERN_ERR "\nB. Corruption after free\n");
p = kzalloc(128, GFP_KERNEL);
kfree(p);
*p = 0x78;
- pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
validate_slab_cache(kmalloc_caches[7]);
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
- pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
validate_slab_cache(kmalloc_caches[8]);
p = kzalloc(512, GFP_KERNEL);
kfree(p);
p[512] = 0xab;
- pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
validate_slab_cache(kmalloc_caches[9]);
}
#else
@@ -4285,17 +4391,18 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int node;
int x;
unsigned long *nodes;
+ unsigned long *per_cpu;
- nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
+ nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
if (!nodes)
return -ENOMEM;
+ per_cpu = nodes + nr_node_ids;
if (flags & SO_CPU) {
int cpu;
for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
- cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
int node;
struct page *page;
@@ -4316,30 +4423,27 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
page = ACCESS_ONCE(c->partial);
if (page) {
- node = page_to_nid(page);
- if (flags & SO_TOTAL)
- WARN_ON_ONCE(1);
- else if (flags & SO_OBJECTS)
- WARN_ON_ONCE(1);
- else
- x = page->pages;
+ x = page->pobjects;
total += x;
nodes[node] += x;
}
+
+ per_cpu[node]++;
}
}
- get_online_mems();
+ lock_memory_hotplug();
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
- if (flags & SO_TOTAL)
- x = atomic_long_read(&n->total_objects);
- else if (flags & SO_OBJECTS)
- x = atomic_long_read(&n->total_objects) -
- count_partial(n, count_free);
+ if (flags & SO_TOTAL)
+ x = atomic_long_read(&n->total_objects);
+ else if (flags & SO_OBJECTS)
+ x = atomic_long_read(&n->total_objects) -
+ count_partial(n, count_free);
+
else
x = atomic_long_read(&n->nr_slabs);
total += x;
@@ -4369,7 +4473,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
+ unlock_memory_hotplug();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -4439,7 +4543,7 @@ static ssize_t order_store(struct kmem_cache *s,
unsigned long order;
int err;
- err = kstrtoul(buf, 10, &order);
+ err = strict_strtoul(buf, 10, &order);
if (err)
return err;
@@ -4467,7 +4571,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
unsigned long min;
int err;
- err = kstrtoul(buf, 10, &min);
+ err = strict_strtoul(buf, 10, &min);
if (err)
return err;
@@ -4487,10 +4591,10 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
unsigned long objects;
int err;
- err = kstrtoul(buf, 10, &objects);
+ err = strict_strtoul(buf, 10, &objects);
if (err)
return err;
- if (objects && !kmem_cache_has_cpu_partial(s))
+ if (objects && kmem_cache_debug(s))
return -EINVAL;
s->cpu_partial = objects;
@@ -4803,7 +4907,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
unsigned long ratio;
int err;
- err = kstrtoul(buf, 10, &ratio);
+ err = strict_strtoul(buf, 10, &ratio);
if (err)
return err;
@@ -5033,7 +5137,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* through the descendants with best-effort propagation.
*/
for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
+ struct kmem_cache *c = cache_from_memcg(s, i);
if (c)
attribute->store(c, buf, len);
}
@@ -5048,18 +5152,15 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
#ifdef CONFIG_MEMCG_KMEM
int i;
char *buffer = NULL;
- struct kmem_cache *root_cache;
- if (is_root_cache(s))
+ if (!is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
-
/*
* This mean this cache had no attribute written. Therefore, no point
* in copying default values around
*/
- if (!root_cache->max_attr_size)
+ if (!s->max_attr_size)
return;
for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
@@ -5081,7 +5182,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (s->max_attr_size < ARRAY_SIZE(mbuf))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5090,7 +5191,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(root_cache, buf);
+ attr->show(s->memcg_params->root_cache, buf);
attr->store(s, buf, strlen(buf));
}
@@ -5099,11 +5200,6 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
#endif
}
-static void kmem_cache_release(struct kobject *k)
-{
- slab_kmem_cache_release(to_slab(k));
-}
-
static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -5111,7 +5207,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
static struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
- .release = kmem_cache_release,
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5129,15 +5224,6 @@ static const struct kset_uevent_ops slab_uevent_ops = {
static struct kset *slab_kset;
-static inline struct kset *cache_kset(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG_KMEM
- if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
-#endif
- return slab_kset;
-}
-
#define ID_STR_LENGTH 64
/* Create a unique string id for a slab cache:
@@ -5173,8 +5259,7 @@ static char *create_unique_id(struct kmem_cache *s)
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- p += sprintf(p, "-%08d",
- memcg_cache_id(s->memcg_params->memcg));
+ p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
#endif
BUG_ON(p > name + ID_STR_LENGTH - 1);
@@ -5203,42 +5288,29 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = cache_kset(s);
- err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err)
- goto out_put_kobj;
+ s->kobj.kset = slab_kset;
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
+ if (err) {
+ kobject_put(&s->kobj);
+ return err;
+ }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
- if (err)
- goto out_del_kobj;
-
-#ifdef CONFIG_MEMCG_KMEM
- if (is_root_cache(s)) {
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
- if (!s->memcg_kset) {
- err = -ENOMEM;
- goto out_del_kobj;
- }
+ if (err) {
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+ return err;
}
-#endif
-
kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
- }
-out:
- if (!unmergeable)
kfree(name);
- return err;
-out_del_kobj:
- kobject_del(&s->kobj);
-out_put_kobj:
- kobject_put(&s->kobj);
- goto out;
+ }
+ return 0;
}
-void sysfs_slab_remove(struct kmem_cache *s)
+static void sysfs_slab_remove(struct kmem_cache *s)
{
if (slab_state < FULL)
/*
@@ -5247,9 +5319,6 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
-#ifdef CONFIG_MEMCG_KMEM
- kset_unregister(s->memcg_kset);
-#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
@@ -5300,7 +5369,7 @@ static int __init slab_sysfs_init(void)
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
- pr_err("Cannot register slab subsystem.\n");
+ printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -5309,8 +5378,8 @@ static int __init slab_sysfs_init(void)
list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
if (err)
- pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
- s->name);
+ printk(KERN_ERR "SLUB: Unable to add boot slab %s"
+ " to sysfs\n", s->name);
}
while (alias_list) {
@@ -5319,8 +5388,8 @@ static int __init slab_sysfs_init(void)
alias_list = alias_list->next;
err = sysfs_slab_alias(al->s, al->name);
if (err)
- pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
- al->name);
+ printk(KERN_ERR "SLUB: Unable to add boot slab alias"
+ " %s to sysfs\n", al->name);
kfree(al);
}
@@ -5338,6 +5407,7 @@ __initcall(slab_sysfs_init);
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
+ unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
unsigned long nr_objs = 0;
unsigned long nr_free = 0;
@@ -5349,8 +5419,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
if (!n)
continue;
- nr_slabs += node_nr_slabs(n);
- nr_objs += node_nr_objs(n);
+ nr_partials += n->nr_partial;
+ nr_slabs += atomic_long_read(&n->nr_slabs);
+ nr_objs += atomic_long_read(&n->total_objects);
nr_free += count_partial(n, count_free);
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4cba9c2783a..1b7e22ab9b0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,8 +40,7 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_virt_alloc_try_nid(size, align, goal,
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
}
static void *vmemmap_buf;
@@ -54,12 +53,10 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
struct page *page;
if (node_state(node, N_HIGH_MEMORY))
- page = alloc_pages_node(
- node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
- get_order(size));
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO, get_order(size));
else
- page = alloc_pages(
- GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+ page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
get_order(size));
if (page)
return page_address(page);
@@ -148,10 +145,11 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start,
- unsigned long end, int node)
+int __meminit vmemmap_populate_basepages(struct page *start_page,
+ unsigned long size, int node)
{
- unsigned long addr = start;
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -178,15 +176,9 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
{
- unsigned long start;
- unsigned long end;
- struct page *map;
-
- map = pfn_to_page(pnum * PAGES_PER_SECTION);
- start = (unsigned long)map;
- end = (unsigned long)(map + PAGES_PER_SECTION);
-
- if (vmemmap_populate(start, end, nid))
+ struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
+ int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
+ if (error)
return NULL;
return map;
@@ -227,8 +219,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
- memblock_free_early(__pa(vmemmap_buf),
- vmemmap_buf_end - vmemmap_buf);
+ free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index d1b48b691ac..6b5fb762e2c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,12 +5,10 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
-#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
-
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -71,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
else
section = kzalloc(array_size, GFP_KERNEL);
} else {
- section = memblock_virt_alloc_node(array_size, nid);
+ section = alloc_bootmem_node(NODE_DATA(nid), array_size);
}
return section;
@@ -81,6 +79,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
+ int ret = 0;
if (mem_section[root])
return -EEXIST;
@@ -91,7 +90,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
mem_section[root] = section;
- return 0;
+ return ret;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
@@ -270,7 +269,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
- * other sections referencing the usemap remain active. Similarly,
+ * other sections referencing the usemap retmain active. Similarly,
* a pgdat can prevent a section being removed. If section A
* contains a pgdat and section B contains the usemap, both
* sections become inter-dependent. This allocates usemaps
@@ -281,9 +280,8 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = memblock_virt_alloc_try_nid_nopanic(size,
- SMP_CACHE_BYTES, goal, limit,
- nid);
+ p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+ SMP_CACHE_BYTES, goal, limit);
if (!p && limit) {
limit = 0;
goto again;
@@ -334,7 +332,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+ return alloc_bootmem_node_nopanic(pgdat, size);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -342,14 +340,13 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static void __init sparse_early_usemaps_alloc_node(void *data,
+static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long usemap_count, int nodeid)
{
void *usemap;
unsigned long pnum;
- unsigned long **usemap_map = (unsigned long **)data;
int size = usemap_size();
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
@@ -379,9 +376,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
- map = memblock_virt_alloc_try_nid(size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -405,9 +401,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+ map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
@@ -436,12 +431,11 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-static void __init sparse_early_mem_maps_alloc_node(void *data,
+static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
unsigned long pnum_begin,
unsigned long pnum_end,
unsigned long map_count, int nodeid)
{
- struct page **map_map = (struct page **)data;
sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
map_count, nodeid);
}
@@ -463,59 +457,10 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
}
#endif
-void __weak __meminit vmemmap_populate_print_last(void)
+void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
}
-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
- */
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
- (void *, unsigned long, unsigned long,
- unsigned long, int), void *data)
-{
- unsigned long pnum;
- unsigned long map_count;
- int nodeid_begin = 0;
- unsigned long pnum_begin = 0;
-
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- struct mem_section *ms;
-
- if (!present_section_nr(pnum))
- continue;
- ms = __nr_to_section(pnum);
- nodeid_begin = sparse_early_nid(ms);
- pnum_begin = pnum;
- break;
- }
- map_count = 1;
- for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
- struct mem_section *ms;
- int nodeid;
-
- if (!present_section_nr(pnum))
- continue;
- ms = __nr_to_section(pnum);
- nodeid = sparse_early_nid(ms);
- if (nodeid == nodeid_begin) {
- map_count++;
- continue;
- }
- /* ok, we need to take cake of from pnum_begin to pnum - 1*/
- alloc_func(data, pnum_begin, pnum,
- map_count, nodeid_begin);
- /* new start, update count etc*/
- nodeid_begin = nodeid;
- pnum_begin = pnum;
- map_count = 1;
- }
- /* ok, last chunk */
- alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
- map_count, nodeid_begin);
-}
-
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -527,14 +472,15 @@ void __init sparse_init(void)
unsigned long *usemap;
unsigned long **usemap_map;
int size;
+ int nodeid_begin = 0;
+ unsigned long pnum_begin = 0;
+ unsigned long usemap_count;
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ unsigned long map_count;
int size2;
struct page **map_map;
#endif
- /* see include/linux/mmzone.h 'struct mem_section' definition */
- BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
-
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
@@ -550,19 +496,85 @@ void __init sparse_init(void)
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
*/
size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = memblock_virt_alloc(size, 0);
+ usemap_map = alloc_bootmem(size);
if (!usemap_map)
panic("can not allocate usemap_map\n");
- alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
- (void *)usemap_map);
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ usemap_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ usemap_count++;
+ continue;
+ }
+ /* ok, we need to take cake of from pnum_begin to pnum - 1*/
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
+ usemap_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ usemap_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
+ usemap_count, nodeid_begin);
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = memblock_virt_alloc(size2, 0);
+ map_map = alloc_bootmem(size2);
if (!map_map)
panic("can not allocate map_map\n");
- alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
- (void *)map_map);
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ map_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ map_count++;
+ continue;
+ }
+ /* ok, we need to take cake of from pnum_begin to pnum - 1*/
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
+ map_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ map_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
+ map_count, nodeid_begin);
#endif
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
@@ -588,39 +600,31 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- memblock_free_early(__pa(map_map), size2);
+ free_bootmem(__pa(map_map), size2);
#endif
- memblock_free_early(__pa(usemap_map), size);
+ free_bootmem(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
{
/* This will make the necessary allocations eventually. */
return sparse_mem_map_populate(pnum, nid);
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
-
- vmemmap_free(start, end);
+ return; /* XXX: Not implemented yet */
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
-
- vmemmap_free(start, end);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(void)
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
@@ -638,30 +642,27 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
{
- return __kmalloc_section_memmap();
+ return __kmalloc_section_memmap(nr_pages);
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * PAGES_PER_SECTION));
+ get_order(sizeof(struct page) * nr_pages));
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- unsigned long magic, nr_pages;
+ unsigned long magic;
struct page *page = virt_to_page(memmap);
- nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
- >> PAGE_SHIFT;
-
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -682,15 +683,47 @@ static void free_map_bootmem(struct page *memmap)
put_page_bootmem(page);
}
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+ struct page *usemap_page;
+ unsigned long nr_pages;
+
+ if (!usemap)
+ return;
+
+ usemap_page = virt_to_page(usemap);
+ /*
+ * Check to see if allocation came from hot-plug-add
+ */
+ if (PageSlab(usemap_page)) {
+ kfree(usemap);
+ if (memmap)
+ __kfree_section_memmap(memmap, PAGES_PER_SECTION);
+ return;
+ }
+
+ /*
+ * The usemap came from bootmem. This is packed with other usemaps
+ * on the section which has pgdat at boot time. Just keep it as is now.
+ */
+
+ if (memmap) {
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ free_map_bootmem(memmap, nr_pages);
+ }
+}
+
/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -707,12 +740,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
if (!usemap) {
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, nr_pages);
return -ENOMEM;
}
@@ -724,7 +757,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
goto out;
}
- memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+ memset(memmap, 0, sizeof(struct page) * nr_pages);
ms->section_mem_map |= SECTION_MARKED_PRESENT;
@@ -734,12 +767,11 @@ out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
kfree(usemap);
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, nr_pages);
}
return ret;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
@@ -750,7 +782,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
for (i = 0; i < PAGES_PER_SECTION; i++) {
if (PageHWPoison(&memmap[i])) {
- atomic_long_sub(1, &num_poisoned_pages);
+ atomic_long_sub(1, &mce_bad_pages);
ClearPageHWPoison(&memmap[i]);
}
}
@@ -761,40 +793,11 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
-static void free_section_usemap(struct page *memmap, unsigned long *usemap)
-{
- struct page *usemap_page;
-
- if (!usemap)
- return;
-
- usemap_page = virt_to_page(usemap);
- /*
- * Check to see if allocation came from hot-plug-add
- */
- if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
- kfree(usemap);
- if (memmap)
- __kfree_section_memmap(memmap);
- return;
- }
-
- /*
- * The usemap came from bootmem. This is packed with other usemaps
- * on the section which has pgdat at boot time. Just keep it as is now.
- */
-
- if (memmap)
- free_map_bootmem(memmap);
-}
-
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
- unsigned long *usemap = NULL, flags;
- struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long *usemap = NULL;
- pgdat_resize_lock(pgdat, &flags);
if (ms->section_mem_map) {
usemap = ms->pageblock_flags;
memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -802,10 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
ms->section_mem_map = 0;
ms->pageblock_flags = NULL;
}
- pgdat_resize_unlock(pgdat, &flags);
clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
free_section_usemap(memmap, usemap);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif
diff --git a/mm/swap.c b/mm/swap.c
index 9e8e3472248..6310dc2008f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,17 +30,13 @@
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
-#include <linux/uio.h>
#include "internal.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/pagemap.h>
-
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
+static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
@@ -57,7 +53,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
+ VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -67,7 +63,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, false);
+ free_hot_cold_page(page, 0);
}
static void __put_compound_page(struct page *page)
@@ -79,185 +75,86 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pinsmust be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
+static void put_compound_page(struct page *page)
{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = compound_trans_head(page);
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
+ if (likely(page != page_head &&
+ get_page_unless_zero(page_head))) {
+ unsigned long flags;
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
+ /*
+ * THP can not break up slab pages so avoid taking
+ * compound_lock(). Slab performs non-atomic bit ops
+ * on page->flags for better performance. In particular
+ * slab_unlock() in slub used to be a hot path. It is
+ * still hot on arches that do not support
+ * this_cpu_cmpxchg_double().
+ */
+ if (PageSlab(page_head)) {
+ if (PageTail(page)) {
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+
+ atomic_dec(&page->_mapcount);
+ goto skip_lock_tail;
+ } else
+ goto skip_lock;
+ }
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+skip_lock:
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on
+ * the compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(page_mapcount(page) <= 0);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
+
+skip_lock_tail:
if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
if (PageHead(page_head))
__put_compound_page(page_head);
else
__put_single_page(page_head);
}
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != page->first_page, page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
}
- return;
+ } else if (put_page_testzero(page)) {
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
}
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head_by_tail(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
}
void put_page(struct page *page)
@@ -284,37 +181,22 @@ bool __get_page_tail(struct page *page)
* split_huge_page().
*/
unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
+ bool got = false;
+ struct page *page_head = compound_trans_head(page);
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+
+ /* Ref to put_compound_page() comment. */
+ if (PageSlab(page_head)) {
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ return true;
+ } else {
+ put_page(page_head);
+ return false;
+ }
}
- }
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -473,7 +355,7 @@ void rotate_reclaimable_page(struct page *page)
page_cache_get(page);
local_irq_save(flags);
- pvec = this_cpu_ptr(&lru_rotate_pvecs);
+ pvec = &__get_cpu_var(lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
pagevec_move_tail(pvec);
local_irq_restore(flags);
@@ -501,7 +383,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
SetPageActive(page);
lru += LRU_ACTIVE;
add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_activate(page, page_to_pfn(page));
__count_vm_event(PGACTIVATE);
update_page_reclaim_stat(lruvec, file, 1);
@@ -519,11 +400,6 @@ static void activate_page_drain(int cpu)
pagevec_lru_move_fn(pvec, __activate_page, NULL);
}
-static bool need_activate_page_drain(int cpu)
-{
- return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
-}
-
void activate_page(struct page *page)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -541,11 +417,6 @@ static inline void activate_page_drain(int cpu)
{
}
-static bool need_activate_page_drain(int cpu)
-{
- return false;
-}
-
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -556,33 +427,6 @@ void activate_page(struct page *page)
}
#endif
-static void __lru_cache_activate_page(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
- int i;
-
- /*
- * Search backwards on the optimistic assumption that the page being
- * activated has just been added to this pagevec. Note that only
- * the local pagevec is examined as a !PageLRU page could be in the
- * process of being released, reclaimed, migrated or on a remote
- * pagevec that is currently being drained. Furthermore, marking
- * a remote pagevec's page PageActive potentially hits a race where
- * a page is marked PageActive just after it is added to the inactive
- * list causing accounting errors and BUG_ON checks to trigger.
- */
- for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
- struct page *pagevec_page = pvec->pages[i];
-
- if (pagevec_page == page) {
- SetPageActive(page);
- break;
- }
- }
-
- put_cpu_var(lru_add_pvec);
-}
-
/*
* Mark a page as having seen activity.
*
@@ -593,21 +437,9 @@ static void __lru_cache_activate_page(struct page *page)
void mark_page_accessed(struct page *page)
{
if (!PageActive(page) && !PageUnevictable(page) &&
- PageReferenced(page)) {
-
- /*
- * If the page is on the LRU, queue it for activation via
- * activate_page_pvecs. Otherwise, assume the page is on a
- * pagevec, mark it active and it'll be moved to the active
- * LRU on the next drain.
- */
- if (PageLRU(page))
- activate_page(page);
- else
- __lru_cache_activate_page(page);
+ PageReferenced(page) && PageLRU(page)) {
+ activate_page(page);
ClearPageReferenced(page);
- if (page_is_file_cache(page))
- workingset_activation(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
@@ -615,60 +447,42 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/*
- * Used to mark_page_accessed(page) that is not visible yet and when it is
- * still safe to use non-atomic ops
+ * Order of operations is important: flush the pagevec when it's already
+ * full, not when adding the last page, to make sure that last page is
+ * not added to the LRU directly when passed to this function. Because
+ * mark_page_accessed() (called after this when writing) only activates
+ * pages that are on the LRU, linear writes in subpage chunks would see
+ * every PAGEVEC_SIZE page activated, which is unexpected.
*/
-void init_page_accessed(struct page *page)
+void __lru_cache_add(struct page *page, enum lru_list lru)
{
- if (!PageReferenced(page))
- __SetPageReferenced(page);
-}
-EXPORT_SYMBOL(init_page_accessed);
-
-static void __lru_cache_add(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
page_cache_get(page);
if (!pagevec_space(pvec))
- __pagevec_lru_add(pvec);
+ __pagevec_lru_add(pvec, lru);
pagevec_add(pvec, page);
- put_cpu_var(lru_add_pvec);
+ put_cpu_var(lru_add_pvecs);
}
+EXPORT_SYMBOL(__lru_cache_add);
/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
+ * lru_cache_add_lru - add a page to a page list
+ * @page: the page to be added to the LRU.
+ * @lru: the LRU list to which the page is added.
*/
-void lru_cache_add_anon(struct page *page)
+void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
- if (PageActive(page))
+ if (PageActive(page)) {
+ VM_BUG_ON(PageUnevictable(page));
ClearPageActive(page);
- __lru_cache_add(page);
-}
+ } else if (PageUnevictable(page)) {
+ VM_BUG_ON(PageActive(page));
+ ClearPageUnevictable(page);
+ }
-void lru_cache_add_file(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-EXPORT_SYMBOL(lru_cache_add_file);
-
-/**
- * lru_cache_add - add a page to a page list
- * @page: the page to be added to the LRU.
- *
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
- */
-void lru_cache_add(struct page *page)
-{
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- __lru_cache_add(page);
+ VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
+ __lru_cache_add(page, lru);
}
/**
@@ -688,7 +502,6 @@ void add_page_to_unevictable_list(struct page *page)
spin_lock_irq(&zone->lru_lock);
lruvec = mem_cgroup_page_lruvec(page, zone);
- ClearPageActive(page);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
@@ -769,10 +582,15 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
+ struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
+ struct pagevec *pvec;
+ int lru;
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
+ for_each_lru(lru) {
+ pvec = &pvecs[lru - LRU_BASE];
+ if (pagevec_count(pvec))
+ __pagevec_lru_add(pvec, lru);
+ }
pvec = &per_cpu(lru_rotate_pvecs, cpu);
if (pagevec_count(pvec)) {
@@ -828,36 +646,12 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_drain();
}
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
-void lru_add_drain_all(void)
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
{
- static DEFINE_MUTEX(lock);
- static struct cpumask has_work;
- int cpu;
-
- mutex_lock(&lock);
- get_online_cpus();
- cpumask_clear(&has_work);
-
- for_each_online_cpu(cpu) {
- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
-
- if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
- need_activate_page_drain(cpu)) {
- INIT_WORK(work, lru_add_drain_per_cpu);
- schedule_work_on(cpu, work);
- cpumask_set_cpu(cpu, &has_work);
- }
- }
-
- for_each_cpu(cpu, &has_work)
- flush_work(&per_cpu(lru_add_drain_work, cpu));
-
- put_online_cpus();
- mutex_unlock(&lock);
+ return schedule_on_each_cpu(lru_add_drain_per_cpu);
}
/*
@@ -873,7 +667,7 @@ void lru_add_drain_all(void)
* grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
* will free it.
*/
-void release_pages(struct page **pages, int nr, bool cold)
+void release_pages(struct page **pages, int nr, int cold)
{
int i;
LIST_HEAD(pages_to_free);
@@ -908,14 +702,11 @@ void release_pages(struct page **pages, int nr, bool cold)
}
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
+ VM_BUG_ON(!PageLRU(page));
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
-
list_add(&page->lru, &pages_to_free);
}
if (zone)
@@ -946,26 +737,37 @@ EXPORT_SYMBOL(__pagevec_release);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* used by __split_huge_page_refcount() */
void lru_add_page_tail(struct page *page, struct page *page_tail,
- struct lruvec *lruvec, struct list_head *list)
+ struct lruvec *lruvec)
{
+ int uninitialized_var(active);
+ enum lru_list lru;
const int file = 0;
- VM_BUG_ON_PAGE(!PageHead(page), page);
- VM_BUG_ON_PAGE(PageCompound(page_tail), page);
- VM_BUG_ON_PAGE(PageLRU(page_tail), page);
+ VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON(PageCompound(page_tail));
+ VM_BUG_ON(PageLRU(page_tail));
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
- if (!list)
- SetPageLRU(page_tail);
+ SetPageLRU(page_tail);
+
+ if (page_evictable(page_tail)) {
+ if (PageActive(page)) {
+ SetPageActive(page_tail);
+ active = 1;
+ lru = LRU_ACTIVE_ANON;
+ } else {
+ active = 0;
+ lru = LRU_INACTIVE_ANON;
+ }
+ } else {
+ SetPageUnevictable(page_tail);
+ lru = LRU_UNEVICTABLE;
+ }
if (likely(PageLRU(page)))
list_add_tail(&page_tail->lru, &page->lru);
- else if (list) {
- /* page reclaim is reclaiming a huge page */
- get_page(page_tail);
- list_add_tail(&page_tail->lru, list);
- } else {
+ else {
struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
@@ -974,91 +776,45 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
* Use the standard add function to put page_tail on the list,
* but then correct its position so they all end up in order.
*/
- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
+ add_page_to_lru_list(page_tail, lruvec, lru);
list_head = page_tail->lru.prev;
list_move_tail(&page_tail->lru, list_head);
}
if (!PageUnevictable(page))
- update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
+ update_page_reclaim_stat(lruvec, file, active);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- int file = page_is_file_cache(page);
- int active = PageActive(page);
- enum lru_list lru = page_lru(page);
+ enum lru_list lru = (enum lru_list)arg;
+ int file = is_file_lru(lru);
+ int active = is_active_lru(lru);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
+ if (active)
+ SetPageActive(page);
add_page_to_lru_list(page, lruvec, lru);
update_page_reclaim_stat(lruvec, file, active);
- trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
}
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
-void __pagevec_lru_add(struct pagevec *pvec)
-{
- pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
-}
-EXPORT_SYMBOL(__pagevec_lru_add);
-
-/**
- * pagevec_lookup_entries - gang pagecache lookup
- * @pvec: Where the resulting entries are placed
- * @mapping: The address_space to search
- * @start: The starting entry index
- * @nr_entries: The maximum number of entries
- * @indices: The cache indices corresponding to the entries in @pvec
- *
- * pagevec_lookup_entries() will search for and return a group of up
- * to @nr_entries pages and shadow entries in the mapping. All
- * entries are placed in @pvec. pagevec_lookup_entries() takes a
- * reference against actual pages in @pvec.
- *
- * The search returns a group of mapping-contiguous entries with
- * ascending indexes. There may be holes in the indices due to
- * not-present entries.
- *
- * pagevec_lookup_entries() returns the number of entries which were
- * found.
- */
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
- struct address_space *mapping,
- pgoff_t start, unsigned nr_pages,
- pgoff_t *indices)
+void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
- pvec->nr = find_get_entries(mapping, start, nr_pages,
- pvec->pages, indices);
- return pagevec_count(pvec);
-}
+ VM_BUG_ON(is_unevictable_lru(lru));
-/**
- * pagevec_remove_exceptionals - pagevec exceptionals pruning
- * @pvec: The pagevec to prune
- *
- * pagevec_lookup_entries() fills both pages and exceptional radix
- * tree entries into the pagevec. This function prunes all
- * exceptionals from @pvec without leaving holes, so that it can be
- * passed on to page-only pagevec operations.
- */
-void pagevec_remove_exceptionals(struct pagevec *pvec)
-{
- int i, j;
-
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
- pvec->pages[j++] = page;
- }
- pvec->nr = j;
+ pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
}
+EXPORT_SYMBOL(__pagevec_lru_add);
/**
* pagevec_lookup - gang pagecache lookup
@@ -1099,15 +855,9 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
void __init swap_setup(void)
{
unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
-#ifdef CONFIG_SWAP
- int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
- for (i = 0; i < MAX_SWAPFILES; i++) {
- spin_lock_init(&swapper_spaces[i].tree_lock);
- INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
- }
+#ifdef CONFIG_SWAP
+ bdi_init(swapper_space.backing_dev_info);
#endif
/* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2972eee184a..0cb36fb1f61 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};
-struct address_space swapper_spaces[MAX_SWAPFILES] = {
- [0 ... MAX_SWAPFILES - 1] = {
- .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
- .a_ops = &swap_aops,
- .backing_dev_info = &swap_backing_dev_info,
- }
+struct address_space swapper_space = {
+ .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+ .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
+ .a_ops = &swap_aops,
+ .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
+ .backing_dev_info = &swap_backing_dev_info,
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,26 +53,13 @@ static struct {
unsigned long find_total;
} swap_cache_info;
-unsigned long total_swapcache_pages(void)
-{
- int i;
- unsigned long ret = 0;
-
- for (i = 0; i < MAX_SWAPFILES; i++)
- ret += swapper_spaces[i].nrpages;
- return ret;
-}
-
-static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
-
void show_swap_cache_info(void)
{
- printk("%lu pages in swap cache\n", total_swapcache_pages());
+ printk("%lu pages in swap cache\n", total_swapcache_pages);
printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
swap_cache_info.find_success, swap_cache_info.find_total);
- printk("Free swap = %ldkB\n",
- get_nr_swap_pages() << (PAGE_SHIFT - 10));
+ printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
@@ -80,29 +67,26 @@ void show_swap_cache_info(void)
* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
- struct address_space *address_space;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapCache(page));
+ VM_BUG_ON(!PageSwapBacked(page));
page_cache_get(page);
SetPageSwapCache(page);
set_page_private(page, entry.val);
- address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
- error = radix_tree_insert(&address_space->page_tree,
- entry.val, page);
+ spin_lock_irq(&swapper_space.tree_lock);
+ error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
if (likely(!error)) {
- address_space->nrpages++;
+ total_swapcache_pages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total);
}
- spin_unlock_irq(&address_space->tree_lock);
+ spin_unlock_irq(&swapper_space.tree_lock);
if (unlikely(error)) {
/*
@@ -124,7 +108,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
- error = radix_tree_maybe_preload(gfp_mask);
+ error = radix_tree_preload(gfp_mask);
if (!error) {
error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
@@ -138,19 +122,14 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
- struct address_space *address_space;
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON(PageWriteback(page));
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- VM_BUG_ON_PAGE(PageWriteback(page), page);
-
- entry.val = page_private(page);
- address_space = swap_address_space(entry);
- radix_tree_delete(&address_space->page_tree, page_private(page));
+ radix_tree_delete(&swapper_space.page_tree, page_private(page));
set_page_private(page, 0);
ClearPageSwapCache(page);
- address_space->nrpages--;
+ total_swapcache_pages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
}
@@ -162,20 +141,20 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page *page, struct list_head *list)
+int add_to_swap(struct page *page)
{
swp_entry_t entry;
int err;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageUptodate(page));
entry = get_swap_page();
if (!entry.val)
return 0;
if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page_to_list(page, list))) {
+ if (unlikely(split_huge_page(page))) {
swapcache_free(entry, NULL);
return 0;
}
@@ -216,14 +195,12 @@ int add_to_swap(struct page *page, struct list_head *list)
void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
- struct address_space *address_space;
entry.val = page_private(page);
- address_space = swap_address_space(entry);
- spin_lock_irq(&address_space->tree_lock);
+ spin_lock_irq(&swapper_space.tree_lock);
__delete_from_swap_cache(page);
- spin_unlock_irq(&address_space->tree_lock);
+ spin_unlock_irq(&swapper_space.tree_lock);
swapcache_free(entry, page);
page_cache_release(page);
@@ -270,7 +247,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, todo, false);
+ release_pages(pagep, todo, 0);
pagep += todo;
nr -= todo;
}
@@ -286,13 +263,10 @@ struct page * lookup_swap_cache(swp_entry_t entry)
{
struct page *page;
- page = find_get_page(swap_address_space(entry), entry.val);
+ page = find_get_page(&swapper_space, entry.val);
- if (page) {
+ if (page)
INC_CACHE_INFO(find_success);
- if (TestClearPageReadahead(page))
- atomic_inc(&swapin_readahead_hits);
- }
INC_CACHE_INFO(find_total);
return page;
@@ -316,8 +290,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swap_address_space(entry),
- entry.val);
+ found_page = find_get_page(&swapper_space, entry.val);
if (found_page)
break;
@@ -333,7 +306,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* call radix_tree_preload() while we can wait.
*/
- err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
+ err = radix_tree_preload(gfp_mask & GFP_KERNEL);
if (err)
break;
@@ -341,24 +314,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) {
+ if (err == -EEXIST) { /* seems racy */
radix_tree_preload_end();
- /*
- * We might race against get_swap_page() and stumble
- * across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet, while
- * the other end is scheduled away waiting on discard
- * I/O completion at scan_swap_map().
- *
- * In order to avoid turning this transitory state
- * into a permanent loop around this -EEXIST case
- * if !CONFIG_PREEMPT and the I/O completion happens
- * to be waiting on the CPU waitqueue where we are now
- * busy looping, we just conditionally invoke the
- * scheduler here, if there are some more important
- * tasks to run.
- */
- cond_resched();
continue;
}
if (err) { /* swp entry is obsolete ? */
@@ -394,50 +351,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
-static unsigned long swapin_nr_pages(unsigned long offset)
-{
- static unsigned long prev_offset;
- unsigned int pages, max_pages, last_ra;
- static atomic_t last_readahead_pages;
-
- max_pages = 1 << ACCESS_ONCE(page_cluster);
- if (max_pages <= 1)
- return 1;
-
- /*
- * This heuristic has been found to work well on both sequential and
- * random loads, swapping to hard disk or to SSD: please don't ask
- * what the "+ 2" means, it just happens to work well, that's all.
- */
- pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
- if (pages == 2) {
- /*
- * We can have no readahead hits to judge by: but must not get
- * stuck here forever, so check for an adjacent offset instead
- * (and don't even bother to check whether swap type is same).
- */
- if (offset != prev_offset + 1 && offset != prev_offset - 1)
- pages = 1;
- prev_offset = offset;
- } else {
- unsigned int roundup = 4;
- while (roundup < pages)
- roundup <<= 1;
- pages = roundup;
- }
-
- if (pages > max_pages)
- pages = max_pages;
-
- /* Don't shrink readahead too fast */
- last_ra = atomic_read(&last_readahead_pages) / 2;
- if (pages < last_ra)
- pages = last_ra;
- atomic_set(&last_readahead_pages, pages);
-
- return pages;
-}
-
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -461,16 +374,11 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- unsigned long entry_offset = swp_offset(entry);
- unsigned long offset = entry_offset;
+ unsigned long offset = swp_offset(entry);
unsigned long start_offset, end_offset;
- unsigned long mask;
+ unsigned long mask = (1UL << page_cluster) - 1;
struct blk_plug plug;
- mask = swapin_nr_pages(offset) - 1;
- if (!mask)
- goto skip;
-
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -484,13 +392,10 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
- if (offset != entry_offset)
- SetPageReadahead(page);
page_cache_release(page);
}
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
-skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4c524f7bd0b..e97a0e5aea9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,8 +47,7 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
-atomic_long_t nr_swap_pages;
-/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
+long nr_swap_pages;
long total_swap_pages;
static int least_priority;
@@ -57,26 +56,7 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-/*
- * all active swap_info_structs
- * protected with swap_lock, and ordered by priority.
- */
-PLIST_HEAD(swap_active_head);
-
-/*
- * all available (active, not full) swap_info_structs
- * protected with swap_avail_lock, ordered by priority.
- * This is used by get_swap_page() instead of swap_active_head
- * because swap_active_head includes all swap_info_structs,
- * but get_swap_page() doesn't need to look at full ones.
- * This uses its own lock instead of swap_lock because when a
- * swap_info_struct changes between not-full/full, it needs to
- * add/remove itself to/from this list, but the swap_info_struct->lock
- * is held and the locking order requires swap_lock to be taken
- * before any swap_info_struct->lock.
- */
-static PLIST_HEAD(swap_avail_head);
-static DEFINE_SPINLOCK(swap_avail_lock);
+struct swap_list_t swap_list = {-1, -1};
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -99,7 +79,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), entry.val);
+ page = find_get_page(&swapper_space, entry.val);
if (!page)
return 0;
/*
@@ -193,296 +173,14 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
-#define SWAPFILE_CLUSTER 256
-#define LATENCY_LIMIT 256
-
-static inline void cluster_set_flag(struct swap_cluster_info *info,
- unsigned int flag)
-{
- info->flags = flag;
-}
-
-static inline unsigned int cluster_count(struct swap_cluster_info *info)
-{
- return info->data;
-}
-
-static inline void cluster_set_count(struct swap_cluster_info *info,
- unsigned int c)
-{
- info->data = c;
-}
-
-static inline void cluster_set_count_flag(struct swap_cluster_info *info,
- unsigned int c, unsigned int f)
-{
- info->flags = f;
- info->data = c;
-}
-
-static inline unsigned int cluster_next(struct swap_cluster_info *info)
-{
- return info->data;
-}
-
-static inline void cluster_set_next(struct swap_cluster_info *info,
- unsigned int n)
-{
- info->data = n;
-}
-
-static inline void cluster_set_next_flag(struct swap_cluster_info *info,
- unsigned int n, unsigned int f)
-{
- info->flags = f;
- info->data = n;
-}
-
-static inline bool cluster_is_free(struct swap_cluster_info *info)
-{
- return info->flags & CLUSTER_FLAG_FREE;
-}
-
-static inline bool cluster_is_null(struct swap_cluster_info *info)
-{
- return info->flags & CLUSTER_FLAG_NEXT_NULL;
-}
-
-static inline void cluster_set_null(struct swap_cluster_info *info)
-{
- info->flags = CLUSTER_FLAG_NEXT_NULL;
- info->data = 0;
-}
-
-/* Add a cluster to discard list and schedule it to do discard */
-static void swap_cluster_schedule_discard(struct swap_info_struct *si,
- unsigned int idx)
-{
- /*
- * If scan_swap_map() can't find a free cluster, it will check
- * si->swap_map directly. To make sure the discarding cluster isn't
- * taken by scan_swap_map(), mark the swap entries bad (occupied). It
- * will be cleared after discard
- */
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- SWAP_MAP_BAD, SWAPFILE_CLUSTER);
-
- if (cluster_is_null(&si->discard_cluster_head)) {
- cluster_set_next_flag(&si->discard_cluster_head,
- idx, 0);
- cluster_set_next_flag(&si->discard_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail = cluster_next(&si->discard_cluster_tail);
- cluster_set_next(&si->cluster_info[tail], idx);
- cluster_set_next_flag(&si->discard_cluster_tail,
- idx, 0);
- }
-
- schedule_work(&si->discard_work);
-}
-
-/*
- * Doing discard actually. After a cluster discard is finished, the cluster
- * will be added to free cluster list. caller should hold si->lock.
-*/
-static void swap_do_scheduled_discard(struct swap_info_struct *si)
-{
- struct swap_cluster_info *info;
- unsigned int idx;
-
- info = si->cluster_info;
-
- while (!cluster_is_null(&si->discard_cluster_head)) {
- idx = cluster_next(&si->discard_cluster_head);
-
- cluster_set_next_flag(&si->discard_cluster_head,
- cluster_next(&info[idx]), 0);
- if (cluster_next(&si->discard_cluster_tail) == idx) {
- cluster_set_null(&si->discard_cluster_head);
- cluster_set_null(&si->discard_cluster_tail);
- }
- spin_unlock(&si->lock);
-
- discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
- SWAPFILE_CLUSTER);
-
- spin_lock(&si->lock);
- cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&si->free_cluster_head)) {
- cluster_set_next_flag(&si->free_cluster_head,
- idx, 0);
- cluster_set_next_flag(&si->free_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail;
-
- tail = cluster_next(&si->free_cluster_tail);
- cluster_set_next(&info[tail], idx);
- cluster_set_next_flag(&si->free_cluster_tail,
- idx, 0);
- }
- memset(si->swap_map + idx * SWAPFILE_CLUSTER,
- 0, SWAPFILE_CLUSTER);
- }
-}
-
-static void swap_discard_work(struct work_struct *work)
-{
- struct swap_info_struct *si;
-
- si = container_of(work, struct swap_info_struct, discard_work);
-
- spin_lock(&si->lock);
- swap_do_scheduled_discard(si);
- spin_unlock(&si->lock);
-}
-
-/*
- * The cluster corresponding to page_nr will be used. The cluster will be
- * removed from free cluster list and its usage counter will be increased.
- */
-static void inc_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
-{
- unsigned long idx = page_nr / SWAPFILE_CLUSTER;
-
- if (!cluster_info)
- return;
- if (cluster_is_free(&cluster_info[idx])) {
- VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
- cluster_set_next_flag(&p->free_cluster_head,
- cluster_next(&cluster_info[idx]), 0);
- if (cluster_next(&p->free_cluster_tail) == idx) {
- cluster_set_null(&p->free_cluster_tail);
- cluster_set_null(&p->free_cluster_head);
- }
- cluster_set_count_flag(&cluster_info[idx], 0, 0);
- }
-
- VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
- cluster_set_count(&cluster_info[idx],
- cluster_count(&cluster_info[idx]) + 1);
-}
-
-/*
- * The cluster corresponding to page_nr decreases one usage. If the usage
- * counter becomes 0, which means no page in the cluster is in using, we can
- * optionally discard the cluster and add it to free cluster list.
- */
-static void dec_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
-{
- unsigned long idx = page_nr / SWAPFILE_CLUSTER;
-
- if (!cluster_info)
- return;
-
- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
- cluster_set_count(&cluster_info[idx],
- cluster_count(&cluster_info[idx]) - 1);
-
- if (cluster_count(&cluster_info[idx]) == 0) {
- /*
- * If the swap is discardable, prepare discard the cluster
- * instead of free it immediately. The cluster will be freed
- * after discard.
- */
- if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
- (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
- swap_cluster_schedule_discard(p, idx);
- return;
- }
-
- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&p->free_cluster_head)) {
- cluster_set_next_flag(&p->free_cluster_head, idx, 0);
- cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
- } else {
- unsigned int tail = cluster_next(&p->free_cluster_tail);
- cluster_set_next(&cluster_info[tail], idx);
- cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
- }
- }
-}
-
-/*
- * It's possible scan_swap_map() uses a free cluster in the middle of free
- * cluster list. Avoiding such abuse to avoid list corruption.
- */
-static bool
-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
- unsigned long offset)
+static int wait_for_discard(void *word)
{
- struct percpu_cluster *percpu_cluster;
- bool conflict;
-
- offset /= SWAPFILE_CLUSTER;
- conflict = !cluster_is_null(&si->free_cluster_head) &&
- offset != cluster_next(&si->free_cluster_head) &&
- cluster_is_free(&si->cluster_info[offset]);
-
- if (!conflict)
- return false;
-
- percpu_cluster = this_cpu_ptr(si->percpu_cluster);
- cluster_set_null(&percpu_cluster->index);
- return true;
+ schedule();
+ return 0;
}
-/*
- * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
- * might involve allocating a new cluster for current CPU too.
- */
-static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
- unsigned long *offset, unsigned long *scan_base)
-{
- struct percpu_cluster *cluster;
- bool found_free;
- unsigned long tmp;
-
-new_cluster:
- cluster = this_cpu_ptr(si->percpu_cluster);
- if (cluster_is_null(&cluster->index)) {
- if (!cluster_is_null(&si->free_cluster_head)) {
- cluster->index = si->free_cluster_head;
- cluster->next = cluster_next(&cluster->index) *
- SWAPFILE_CLUSTER;
- } else if (!cluster_is_null(&si->discard_cluster_head)) {
- /*
- * we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them
- */
- swap_do_scheduled_discard(si);
- *scan_base = *offset = si->cluster_next;
- goto new_cluster;
- } else
- return;
- }
-
- found_free = false;
-
- /*
- * Other CPUs can use our cluster if they can't find a free cluster,
- * check if there is still free entry in the cluster
- */
- tmp = cluster->next;
- while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
- SWAPFILE_CLUSTER) {
- if (!si->swap_map[tmp]) {
- found_free = true;
- break;
- }
- tmp++;
- }
- if (!found_free) {
- cluster_set_null(&cluster->index);
- goto new_cluster;
- }
- cluster->next = tmp + 1;
- *offset = tmp;
- *scan_base = tmp;
-}
+#define SWAPFILE_CLUSTER 256
+#define LATENCY_LIMIT 256
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
@@ -491,6 +189,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned long scan_base;
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
+ int found_free_cluster = 0;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -506,27 +205,36 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
si->flags += SWP_SCANNING;
scan_base = offset = si->cluster_next;
- /* SSD algorithm */
- if (si->cluster_info) {
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
- goto checks;
- }
-
if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
}
-
- spin_unlock(&si->lock);
+ if (si->flags & SWP_DISCARDABLE) {
+ /*
+ * Start range check on racing allocations, in case
+ * they overlap the cluster we eventually decide on
+ * (we scan without swap_lock to allow preemption).
+ * It's hardly conceivable that cluster_nr could be
+ * wrapped during our scan, but don't depend on it.
+ */
+ if (si->lowest_alloc)
+ goto checks;
+ si->lowest_alloc = si->max;
+ si->highest_alloc = 0;
+ }
+ spin_unlock(&swap_lock);
/*
* If seek is expensive, start searching for new cluster from
* start of partition, to minimize the span of allocated swap.
- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
- * case, just handled by scan_swap_map_try_ssd_cluster() above.
+ * But if seek is cheap, search from our current position, so
+ * that swap is allocated from all over the partition: if the
+ * Flash Translation Layer only remaps within limited zones,
+ * we don't want to wear out the first zone too quickly.
*/
- scan_base = offset = si->lowest_bit;
+ if (!(si->flags & SWP_SOLIDSTATE))
+ scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
/* Locate the first empty (unaligned) cluster */
@@ -534,10 +242,32 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
if (si->swap_map[offset])
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ }
+
+ offset = si->lowest_bit;
+ last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+ /* Locate the first empty (unaligned) cluster */
+ for (; last_in_cluster < scan_base; offset++) {
+ if (si->swap_map[offset])
+ last_in_cluster = offset + SWAPFILE_CLUSTER;
+ else if (offset == last_in_cluster) {
+ spin_lock(&swap_lock);
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ found_free_cluster = 1;
goto checks;
}
if (unlikely(--latency_ration < 0)) {
@@ -547,15 +277,12 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
offset = scan_base;
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ si->lowest_alloc = 0;
}
checks:
- if (si->cluster_info) {
- while (scan_swap_map_ssd_cluster_conflict(si, offset))
- scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
- }
if (!(si->flags & SWP_WRITEOK))
goto no_page;
if (!si->highest_bit)
@@ -566,9 +293,9 @@ checks:
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
goto checks;
@@ -586,26 +313,75 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
- inc_cluster_info_page(si, si->cluster_info, offset);
si->cluster_next = offset + 1;
si->flags -= SWP_SCANNING;
+ if (si->lowest_alloc) {
+ /*
+ * Only set when SWP_DISCARDABLE, and there's a scan
+ * for a free cluster in progress or just completed.
+ */
+ if (found_free_cluster) {
+ /*
+ * To optimize wear-levelling, discard the
+ * old data of the cluster, taking care not to
+ * discard any of its pages that have already
+ * been allocated by racing tasks (offset has
+ * already stepped over any at the beginning).
+ */
+ if (offset < si->highest_alloc &&
+ si->lowest_alloc <= last_in_cluster)
+ last_in_cluster = si->lowest_alloc - 1;
+ si->flags |= SWP_DISCARDING;
+ spin_unlock(&swap_lock);
+
+ if (offset < last_in_cluster)
+ discard_swap_cluster(si, offset,
+ last_in_cluster - offset + 1);
+
+ spin_lock(&swap_lock);
+ si->lowest_alloc = 0;
+ si->flags &= ~SWP_DISCARDING;
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+ } else if (si->flags & SWP_DISCARDING) {
+ /*
+ * Delay using pages allocated by racing tasks
+ * until the whole discard has been issued. We
+ * could defer that delay until swap_writepage,
+ * but it's easier to keep this self-contained.
+ */
+ spin_unlock(&swap_lock);
+ wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+ wait_for_discard, TASK_UNINTERRUPTIBLE);
+ spin_lock(&swap_lock);
+ } else {
+ /*
+ * Note pages allocated by racing tasks while
+ * scan for a free cluster is in progress, so
+ * that its final discard can exclude them.
+ */
+ if (offset < si->lowest_alloc)
+ si->lowest_alloc = offset;
+ if (offset > si->highest_alloc)
+ si->highest_alloc = offset;
+ }
+ }
return offset;
scan:
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
while (++offset <= si->highest_bit) {
if (!si->swap_map[offset]) {
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
@@ -614,22 +390,21 @@ scan:
}
}
offset = si->lowest_bit;
- while (offset < scan_base) {
+ while (++offset < scan_base) {
if (!si->swap_map[offset]) {
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
}
- offset++;
}
- spin_lock(&si->lock);
+ spin_lock(&swap_lock);
no_page:
si->flags -= SWP_SCANNING;
@@ -638,87 +413,65 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si, *next;
+ struct swap_info_struct *si;
pgoff_t offset;
+ int type, next;
+ int wrapped = 0;
- if (atomic_long_read(&nr_swap_pages) <= 0)
+ spin_lock(&swap_lock);
+ if (nr_swap_pages <= 0)
goto noswap;
- atomic_long_dec(&nr_swap_pages);
-
- spin_lock(&swap_avail_lock);
-
-start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
- /* requeue si to after same-priority siblings */
- plist_requeue(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- spin_lock(&si->lock);
- if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_list)) {
- spin_unlock(&si->lock);
- goto nextsi;
- }
- WARN(!si->highest_bit,
- "swap_info %d in list but !highest_bit\n",
- si->type);
- WARN(!(si->flags & SWP_WRITEOK),
- "swap_info %d in list but !SWP_WRITEOK\n",
- si->type);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&si->lock);
- goto nextsi;
+ nr_swap_pages--;
+
+ for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+ si = swap_info[type];
+ next = si->next;
+ if (next < 0 ||
+ (!wrapped && si->prio != swap_info[next]->prio)) {
+ next = swap_list.head;
+ wrapped++;
}
+ if (!si->highest_bit)
+ continue;
+ if (!(si->flags & SWP_WRITEOK))
+ continue;
+
+ swap_list.next = next;
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
- spin_unlock(&si->lock);
- if (offset)
- return swp_entry(si->type, offset);
- pr_debug("scan_swap_map of si %d failed to find offset\n",
- si->type);
- spin_lock(&swap_avail_lock);
-nextsi:
- /*
- * if we got here, it's likely that si was almost full before,
- * and since scan_swap_map() can drop the si->lock, multiple
- * callers probably all tried to get a page from the same si
- * and it filled up before we could get one; or, the si filled
- * up between us dropping swap_avail_lock and taking si->lock.
- * Since we dropped the swap_avail_lock, the swap_avail_head
- * list may have been modified; so if next is still in the
- * swap_avail_head list then try it, otherwise start over.
- */
- if (plist_node_empty(&next->avail_list))
- goto start_over;
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ next = swap_list.next;
}
- spin_unlock(&swap_avail_lock);
-
- atomic_long_inc(&nr_swap_pages);
+ nr_swap_pages++;
noswap:
+ spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
-/* The only caller of this function is now suspend routine */
+/* The only caller of this function is now susupend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
pgoff_t offset;
+ spin_lock(&swap_lock);
si = swap_info[type];
- spin_lock(&si->lock);
if (si && (si->flags & SWP_WRITEOK)) {
- atomic_long_dec(&nr_swap_pages);
+ nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
if (offset) {
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
return swp_entry(type, offset);
}
- atomic_long_inc(&nr_swap_pages);
+ nr_swap_pages++;
}
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -740,20 +493,20 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
goto bad_offset;
if (!p->swap_map[offset])
goto bad_free;
- spin_lock(&p->lock);
+ spin_lock(&swap_lock);
return p;
bad_free:
- pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
+ printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
goto out;
bad_offset:
- pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
+ printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
+ printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
+ printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
return NULL;
}
@@ -796,22 +549,14 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
/* free if no reference */
if (!usage) {
- dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit) {
- bool was_full = !p->highest_bit;
+ if (offset > p->highest_bit)
p->highest_bit = offset;
- if (was_full && (p->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&p->avail_list));
- if (plist_node_empty(&p->avail_list))
- plist_add(&p->avail_list,
- &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
- }
- atomic_long_inc(&nr_swap_pages);
+ if (swap_list.next >= 0 &&
+ p->prio > swap_info[swap_list.next]->prio)
+ swap_list.next = p->type;
+ nr_swap_pages++;
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
if (p->flags & SWP_BLKDEV) {
@@ -826,7 +571,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
/*
- * Caller has made sure that the swap device corresponding to entry
+ * Caller has made sure that the swapdevice corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
@@ -836,7 +581,7 @@ void swap_free(swp_entry_t entry)
p = swap_info_get(entry);
if (p) {
swap_entry_free(p, entry, 1);
- spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
}
}
@@ -853,7 +598,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
if (page)
mem_cgroup_uncharge_swapcache(page, entry, count != 0);
- spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
}
}
@@ -872,7 +617,7 @@ int page_swapcount(struct page *page)
p = swap_info_get(entry);
if (p) {
count = swap_count(p->swap_map[swp_offset(entry)]);
- spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
}
return count;
}
@@ -887,7 +632,7 @@ int reuse_swap_page(struct page *page)
{
int count;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
if (unlikely(PageKsm(page)))
return 0;
count = page_mapcount(page);
@@ -907,7 +652,7 @@ int reuse_swap_page(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!PageLocked(page));
if (!PageSwapCache(page))
return 0;
@@ -928,7 +673,7 @@ int try_to_free_swap(struct page *page)
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
- * Hibernation suspends storage while it is writing the image
+ * Hibration suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
@@ -954,14 +699,13 @@ int free_swap_and_cache(swp_entry_t entry)
p = swap_info_get(entry);
if (p) {
if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
- page = find_get_page(swap_address_space(entry),
- entry.val);
+ page = find_get_page(&swapper_space, entry.val);
if (page && !trylock_page(page)) {
page_cache_release(page);
page = NULL;
}
}
- spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
}
if (page) {
/*
@@ -1059,34 +803,17 @@ unsigned int count_swap_pages(int type, int free)
if ((unsigned int)type < nr_swapfiles) {
struct swap_info_struct *sis = swap_info[type];
- spin_lock(&sis->lock);
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
n -= sis->inuse_pages;
}
- spin_unlock(&sis->lock);
}
spin_unlock(&swap_lock);
return n;
}
#endif /* CONFIG_HIBERNATION */
-static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
-{
-#ifdef CONFIG_MEM_SOFT_DIRTY
- /*
- * When pte keeps soft dirty bit the pte generated
- * from swap entry does not has it, still it's same
- * pte from logical point of view.
- */
- pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
- return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
-#else
- return pte_same(pte, swp_pte);
-#endif
-}
-
/*
* No need to decide whether this PTE shares the swap entry with others,
* just let do_wp_page work it out if a write is requested later - to
@@ -1095,17 +822,11 @@ static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- struct page *swapcache;
struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
- swapcache = page;
- page = ksm_might_need_to_copy(page, vma, addr);
- if (unlikely(!page))
- return -ENOMEM;
-
if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
GFP_KERNEL, &memcg)) {
ret = -ENOMEM;
@@ -1113,7 +834,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
+ if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
mem_cgroup_cancel_charge_swapin(memcg);
ret = 0;
goto out;
@@ -1124,10 +845,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
- if (page == swapcache)
- page_add_anon_rmap(page, vma, addr);
- else /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
+ page_add_anon_rmap(page, vma, addr);
mem_cgroup_commit_charge_swapin(page, memcg);
swap_free(entry);
/*
@@ -1138,10 +856,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
out:
pte_unmap_unlock(pte, ptl);
out_nolock:
- if (page != swapcache) {
- unlock_page(page);
- put_page(page);
- }
return ret;
}
@@ -1160,7 +874,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* some architectures (e.g. x86_32 with PAE) we might catch a glimpse
* of unmatched parts which look like swp_pte, so unuse_pte must
* recheck under pte lock. Scanning without pte lock lets it be
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
*/
pte = pte_offset_map(pmd, addr);
do {
@@ -1168,7 +882,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
- if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+ if (unlikely(pte_same(*pte, swp_pte))) {
pte_unmap(pte);
ret = unuse_pte(vma, pmd, addr, entry, page);
if (ret)
@@ -1313,7 +1027,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
else
continue;
}
- count = ACCESS_ONCE(si->swap_map[i]);
+ count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
@@ -1333,11 +1047,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
- volatile unsigned char *swap_map; /* swap_map is accessed without
- * locking. Mark it as volatile
- * to prevent compiler doing
- * something odd.
- */
+ unsigned char *swap_map;
unsigned char swcount;
struct page *page;
swp_entry_t entry;
@@ -1388,15 +1098,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
* reused since sys_swapoff() already disabled
* allocation from here, or alloc_page() failed.
*/
- swcount = *swap_map;
- /*
- * We don't hold lock here, so the swap entry could be
- * SWAP_MAP_BAD (when the cluster is discarding).
- * Instead of fail out, We can just skip the swap
- * entry because swapoff will wait for discarding
- * finish anyway.
- */
- if (!swcount || swcount == SWAP_MAP_BAD)
+ if (!*swap_map)
continue;
retval = -ENOMEM;
break;
@@ -1743,60 +1445,48 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+ unsigned long *frontswap_map)
{
+ int i, prev;
+
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
- /*
- * the plist prio is negated because plist ordering is
- * low-to-high, while swap ordering is high-to-low
- */
- p->list.prio = -p->prio;
- p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
- p->cluster_info = cluster_info;
+ frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
- atomic_long_add(p->pages, &nr_swap_pages);
+ nr_swap_pages += p->pages;
total_swap_pages += p->pages;
- assert_spin_locked(&swap_lock);
- /*
- * both lists are plists, and thus priority ordered.
- * swap_active_head needs to be priority ordered for swapoff(),
- * which on removal of any swap_info_struct with an auto-assigned
- * (i.e. negative) priority increments the auto-assigned priority
- * of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for get_swap_page(),
- * which allocates swap pages from the highest available priority
- * swap_info_struct.
- */
- plist_add(&p->list, &swap_active_head);
- spin_lock(&swap_avail_lock);
- plist_add(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
+ break;
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0)
+ swap_list.head = swap_list.next = p->type;
+ else
+ swap_info[prev]->next = p->type;
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
unsigned long *frontswap_map)
{
- frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
- spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
- spin_unlock(&p->lock);
+ _enable_swap_info(p, prio, swap_map, frontswap_map);
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
- spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
- spin_unlock(&p->lock);
+ _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
spin_unlock(&swap_lock);
}
@@ -1804,14 +1494,12 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
- struct swap_cluster_info *cluster_info;
- unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int err, found = 0;
- unsigned int old_block_size;
+ int i, type, prev;
+ int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1828,16 +1516,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
+ prev = -1;
spin_lock(&swap_lock);
- plist_for_each_entry(p, &swap_active_head, list) {
+ for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
+ p = swap_info[type];
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping) {
- found = 1;
+ if (p->swap_file->f_mapping == mapping)
break;
- }
}
+ prev = type;
}
- if (!found) {
+ if (type < 0) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1849,29 +1538,26 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- spin_lock(&swap_avail_lock);
- plist_del(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- spin_lock(&p->lock);
+ if (prev < 0)
+ swap_list.head = p->next;
+ else
+ swap_info[prev]->next = p->next;
+ if (type == swap_list.next) {
+ /* just pick something that's safe... */
+ swap_list.next = swap_list.head;
+ }
if (p->prio < 0) {
- struct swap_info_struct *si = p;
-
- plist_for_each_entry_continue(si, &swap_active_head, list) {
- si->prio++;
- si->list.prio--;
- si->avail_list.prio--;
- }
+ for (i = p->next; i >= 0; i = swap_info[i]->next)
+ swap_info[i]->prio = p->prio--;
least_priority++;
}
- plist_del(&p->list, &swap_active_head);
- atomic_long_sub(p->pages, &nr_swap_pages);
+ nr_swap_pages -= p->pages;
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
- spin_unlock(&p->lock);
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ err = try_to_unuse(type, false, 0); /* force all pages to be unused */
clear_current_oom_origin();
if (err) {
@@ -1880,53 +1566,40 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out_dput;
}
- flush_work(&p->discard_work);
-
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
- spin_lock(&p->lock);
drain_mmlist();
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
- spin_unlock(&p->lock);
spin_unlock(&swap_lock);
schedule_timeout_uninterruptible(1);
spin_lock(&swap_lock);
- spin_lock(&p->lock);
}
swap_file = p->swap_file;
- old_block_size = p->old_block_size;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
- cluster_info = p->cluster_info;
- p->cluster_info = NULL;
- frontswap_map = frontswap_map_get(p);
- spin_unlock(&p->lock);
+ p->flags = 0;
+ frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(p->type);
- frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
vfree(swap_map);
- vfree(cluster_info);
- vfree(frontswap_map);
- /* Destroy swap account information */
- swap_cgroup_swapoff(p->type);
+ vfree(frontswap_map_get(p));
+ /* Destroy swap account informatin */
+ swap_cgroup_swapoff(type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
- set_blocksize(bdev, old_block_size);
+ set_blocksize(bdev, p->old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
@@ -1934,16 +1607,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
-
- /*
- * Clear the SWP_USED flag after all resources are freed so that swapon
- * can reuse this swap_info in alloc_swap_info() safely. It is ok to
- * not hold p->lock after we cleared its SWP_WRITEOK.
- */
- spin_lock(&swap_lock);
- p->flags = 0;
- spin_unlock(&swap_lock);
-
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
@@ -2036,7 +1699,7 @@ static int swap_show(struct seq_file *swap, void *v)
len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file_inode(file)->i_mode) ?
+ S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
"partition" : "file\t",
si->pages << (PAGE_SHIFT - 10),
si->inuse_pages << (PAGE_SHIFT - 10),
@@ -2128,11 +1791,9 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
- plist_node_init(&p->list, 0);
- plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
+ p->next = -1;
spin_unlock(&swap_lock);
- spin_lock_init(&p->lock);
return p;
}
@@ -2173,10 +1834,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
int i;
unsigned long maxpages;
unsigned long swapfilepages;
- unsigned long last_page;
if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
- pr_err("Unable to find swap-space signature\n");
+ printk(KERN_ERR "Unable to find swap-space signature\n");
return 0;
}
@@ -2190,8 +1850,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
}
/* Check the swap header's sub-version */
if (swap_header->info.version != 1) {
- pr_warn("Unable to handle swap header version %d\n",
- swap_header->info.version);
+ printk(KERN_WARNING
+ "Unable to handle swap header version %d\n",
+ swap_header->info.version);
return 0;
}
@@ -2215,14 +1876,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
*/
maxpages = swp_offset(pte_to_swp_entry(
swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
- last_page = swap_header->info.last_page;
- if (last_page > maxpages) {
- pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
- maxpages << (PAGE_SHIFT - 10),
- last_page << (PAGE_SHIFT - 10));
- }
- if (maxpages > last_page) {
- maxpages = last_page + 1;
+ if (maxpages > swap_header->info.last_page) {
+ maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
if ((unsigned int)maxpages == 0)
maxpages = UINT_MAX;
@@ -2233,7 +1888,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return 0;
swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
if (swapfilepages && maxpages > swapfilepages) {
- pr_warn("Swap area shorter than signature indicates\n");
+ printk(KERN_WARNING
+ "Swap area shorter than signature indicates\n");
return 0;
}
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -2247,23 +1903,15 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
static int setup_swap_map_and_extents(struct swap_info_struct *p,
union swap_header *swap_header,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
unsigned long maxpages,
sector_t *span)
{
int i;
unsigned int nr_good_pages;
int nr_extents;
- unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
nr_good_pages = maxpages - 1; /* omit header page */
- cluster_set_null(&p->free_cluster_head);
- cluster_set_null(&p->free_cluster_tail);
- cluster_set_null(&p->discard_cluster_head);
- cluster_set_null(&p->discard_cluster_tail);
-
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2271,25 +1919,11 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
- /*
- * Haven't marked the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, page_nr);
}
}
- /* Haven't marked the cluster free yet, no list operation involved */
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(p, cluster_info, i);
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
- /*
- * Not mark the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, 0);
p->max = maxpages;
p->pages = nr_good_pages;
nr_extents = setup_swap_extents(p, span);
@@ -2298,51 +1932,13 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
nr_good_pages = p->pages;
}
if (!nr_good_pages) {
- pr_warn("Empty swap-file\n");
+ printk(KERN_WARNING "Empty swap-file\n");
return -EINVAL;
}
- if (!cluster_info)
- return nr_extents;
-
- for (i = 0; i < nr_clusters; i++) {
- if (!cluster_count(&cluster_info[idx])) {
- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- if (cluster_is_null(&p->free_cluster_head)) {
- cluster_set_next_flag(&p->free_cluster_head,
- idx, 0);
- cluster_set_next_flag(&p->free_cluster_tail,
- idx, 0);
- } else {
- unsigned int tail;
-
- tail = cluster_next(&p->free_cluster_tail);
- cluster_set_next(&cluster_info[tail], idx);
- cluster_set_next_flag(&p->free_cluster_tail,
- idx, 0);
- }
- }
- idx++;
- if (idx == nr_clusters)
- idx = 0;
- }
return nr_extents;
}
-/*
- * Helper to sys_swapon determining if a given swap
- * backing device queue supports DISCARD operations.
- */
-static bool swap_discardable(struct swap_info_struct *si)
-{
- struct request_queue *q = bdev_get_queue(si->bdev);
-
- if (!q || !blk_queue_discard(q))
- return false;
-
- return true;
-}
-
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
@@ -2357,7 +1953,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
- struct swap_cluster_info *cluster_info = NULL;
unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2372,8 +1967,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (IS_ERR(p))
return PTR_ERR(p);
- INIT_WORK(&p->discard_work, swap_discard_work);
-
name = getname(specialfile);
if (IS_ERR(name)) {
error = PTR_ERR(name);
@@ -2433,74 +2026,28 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -ENOMEM;
goto bad_swap;
}
- if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
- p->flags |= SWP_SOLIDSTATE;
- /*
- * select a random position to start with to help wear leveling
- * SSD
- */
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
-
- cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
- SWAPFILE_CLUSTER) * sizeof(*cluster_info));
- if (!cluster_info) {
- error = -ENOMEM;
- goto bad_swap;
- }
- p->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!p->percpu_cluster) {
- error = -ENOMEM;
- goto bad_swap;
- }
- for_each_possible_cpu(i) {
- struct percpu_cluster *cluster;
- cluster = per_cpu_ptr(p->percpu_cluster, i);
- cluster_set_null(&cluster->index);
- }
- }
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
goto bad_swap;
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
- cluster_info, maxpages, &span);
+ maxpages, &span);
if (unlikely(nr_extents < 0)) {
error = nr_extents;
goto bad_swap;
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (frontswap_enabled)
- frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
-
- if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
- /*
- * When discard is enabled for swap with no particular
- * policy flagged, we set all swap discard flags here in
- * order to sustain backward compatibility with older
- * swapon(8) releases.
- */
- p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
- SWP_PAGE_DISCARD);
+ frontswap_map = vzalloc(maxpages / sizeof(long));
- /*
- * By flagging sys_swapon, a sysadmin can tell us to
- * either do single-time area discards only, or to just
- * perform discards for released swap page-clusters.
- * Now it's time to adjust the p->flags accordingly.
- */
- if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
- p->flags &= ~SWP_PAGE_DISCARD;
- else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
- p->flags &= ~SWP_AREA_DISCARD;
-
- /* issue a swapon-time discard if it's still required */
- if (p->flags & SWP_AREA_DISCARD) {
- int err = discard_swap(p);
- if (unlikely(err))
- pr_err("swapon: discard_swap(%p): %d\n",
- p, err);
+ if (p->bdev) {
+ if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next = 1 + (random32() % p->highest_bit);
}
+ if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
+ p->flags |= SWP_DISCARDABLE;
}
mutex_lock(&swapon_mutex);
@@ -2508,16 +2055,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
- pr_info("Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ printk(KERN_INFO "Adding %uk swap on %s. "
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
- (p->flags & SWP_AREA_DISCARD) ? "s" : "",
- (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
(frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
@@ -2529,8 +2074,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
bad_swap:
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2542,7 +2085,6 @@ bad_swap:
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
- vfree(cluster_info);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
mutex_unlock(&inode->i_mutex);
@@ -2574,7 +2116,7 @@ void si_swapinfo(struct sysinfo *val)
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
nr_to_be_unused += si->inuse_pages;
}
- val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
+ val->freeswap = nr_swap_pages + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
spin_unlock(&swap_lock);
}
@@ -2607,21 +2149,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
p = swap_info[type];
offset = swp_offset(entry);
- spin_lock(&p->lock);
+ spin_lock(&swap_lock);
if (unlikely(offset >= p->max))
goto unlock_out;
count = p->swap_map[offset];
-
- /*
- * swapin_readahead() doesn't check if a swap entry is valid, so the
- * swap entry could be SWAP_MAP_BAD. Check here with lock held.
- */
- if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
- err = -ENOENT;
- goto unlock_out;
- }
-
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
err = 0;
@@ -2652,12 +2184,12 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
p->swap_map[offset] = count | has_cache;
unlock_out:
- spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
out:
return err;
bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
+ printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
@@ -2711,7 +2243,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
*/
struct address_space *__page_file_mapping(struct page *page)
{
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON(!PageSwapCache(page));
return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2719,7 +2251,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
pgoff_t __page_file_index(struct page *page)
{
swp_entry_t swap = { .val = page_private(page) };
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON(!PageSwapCache(page));
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
@@ -2777,14 +2309,14 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
}
if (!page) {
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
return -ENOMEM;
}
/*
* We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel page tables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+ * no architecture is using highmem pages for kernel pagetables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
*/
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
@@ -2825,7 +2357,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
list_add_tail(&page->lru, &head->lru);
page = NULL; /* now it's attached, don't free it */
out:
- spin_unlock(&si->lock);
+ spin_unlock(&swap_lock);
outer:
if (page)
__free_page(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 6a78c814beb..c75b736e54b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -22,51 +22,11 @@
#include <linux/cleancache.h>
#include "internal.h"
-static void clear_exceptional_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- struct radix_tree_node *node;
- void **slot;
-
- /* Handled by shmem itself */
- if (shmem_mapping(mapping))
- return;
-
- spin_lock_irq(&mapping->tree_lock);
- /*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
- */
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
- radix_tree_replace_slot(slot, NULL);
- mapping->nrshadows--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes, &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
-unlock:
- spin_unlock_irq(&mapping->tree_lock);
-}
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
- * @offset: start of the range to invalidate
- * @length: length of the range to invalidate
+ * @offset: the index of the truncation point
*
* do_invalidatepage() is called when all or part of the page has become
* invalidated by a truncate operation.
@@ -77,18 +37,24 @@ unlock:
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void do_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void do_invalidatepage(struct page *page, unsigned long offset)
{
- void (*invalidatepage)(struct page *, unsigned int, unsigned int);
-
+ void (*invalidatepage)(struct page *, unsigned long);
invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
if (!invalidatepage)
invalidatepage = block_invalidatepage;
#endif
if (invalidatepage)
- (*invalidatepage)(page, offset, length);
+ (*invalidatepage)(page, offset);
+}
+
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+ zero_user_segment(page, partial, PAGE_CACHE_SIZE);
+ cleancache_invalidate_page(page->mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, partial);
}
/*
@@ -137,7 +103,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
return -EIO;
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0);
cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -219,11 +185,11 @@ int invalidate_inode_page(struct page *page)
* truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
- * @lend: offset to which to truncate (inclusive)
+ * @lend: offset to which to truncate
*
* Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial pages
- * if lstart or lend + 1 is not page aligned).
+ * specified offsets (and zeroing out partial page
+ * (if lstart is not page aligned)).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
@@ -234,67 +200,37 @@ int invalidate_inode_page(struct page *page)
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
- *
- * Note that since ->invalidatepage() accepts range to invalidate
- * truncate_inode_pages_range is able to handle cases where lend + 1 is not
- * page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- pgoff_t start; /* inclusive */
- pgoff_t end; /* exclusive */
- unsigned int partial_start; /* inclusive */
- unsigned int partial_end; /* exclusive */
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- pgoff_t index;
- int i;
+ const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ int i;
cleancache_invalidate_inode(mapping);
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ if (mapping->nrpages == 0)
return;
- /* Offsets within partial pages */
- partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
-
- /*
- * 'start' and 'end' always covers the range of pages to be fully
- * truncated. Partial pages are covered with 'partial_start' at the
- * start of the range and 'partial_end' at the end of the range.
- * Note that 'end' is exclusive while 'lend' is inclusive.
- */
- start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (lend == -1)
- /*
- * lend == -1 indicates end-of-file so we have to set 'end'
- * to the highest possible pgoff_t and since the type is
- * unsigned we're using -1.
- */
- end = -1;
- else
- end = (lend + 1) >> PAGE_CACHE_SHIFT;
+ BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
+ end = (lend >> PAGE_CACHE_SHIFT);
pagevec_init(&pvec, 0);
index = start;
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end)
+ index = page->index;
+ if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -305,65 +241,33 @@ void truncate_inode_pages_range(struct address_space *mapping,
truncate_inode_page(mapping, page);
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
index++;
}
- if (partial_start) {
+ if (partial) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
- if (start > end) {
- /* Truncation within a single page */
- top = partial_end;
- partial_end = 0;
- }
- wait_on_page_writeback(page);
- zero_user_segment(page, partial_start, top);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial_start,
- top - partial_start);
- unlock_page(page);
- page_cache_release(page);
- }
- }
- if (partial_end) {
- struct page *page = find_lock_page(mapping, end);
- if (page) {
wait_on_page_writeback(page);
- zero_user_segment(page, 0, partial_end);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, 0,
- partial_end);
+ truncate_partial_page(page, partial);
unlock_page(page);
page_cache_release(page);
}
}
- /*
- * If the truncation happened within a single page no pages
- * will be released, just zeroed, so we can bail out now.
- */
- if (start >= end)
- return;
index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
+ if (!pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
if (index == start)
break;
index = start;
continue;
}
- if (index == start && indices[0] >= end) {
- pagevec_remove_exceptionals(&pvec);
+ if (index == start && pvec.pages[0]->index > end) {
pagevec_release(&pvec);
break;
}
@@ -372,22 +276,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end)
+ index = page->index;
+ if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
lock_page(page);
WARN_ON(page->index != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -415,53 +313,6 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
EXPORT_SYMBOL(truncate_inode_pages);
/**
- * truncate_inode_pages_final - truncate *all* pages before inode dies
- * @mapping: mapping to truncate
- *
- * Called under (and serialized by) inode->i_mutex.
- *
- * Filesystems have to use this in the .evict_inode path to inform the
- * VM that this is the final truncate and the inode is going away.
- */
-void truncate_inode_pages_final(struct address_space *mapping)
-{
- unsigned long nrshadows;
- unsigned long nrpages;
-
- /*
- * Page reclaim can not participate in regular inode lifetime
- * management (can't call iput()) and thus can race with the
- * inode teardown. Tell it when the address space is exiting,
- * so that it does not install eviction information after the
- * final truncate has begun.
- */
- mapping_set_exiting(mapping);
-
- /*
- * When reclaim installs eviction entries, it increases
- * nrshadows first, then decreases nrpages. Make sure we see
- * this in the right order or we might miss an entry.
- */
- nrpages = mapping->nrpages;
- smp_rmb();
- nrshadows = mapping->nrshadows;
-
- if (nrpages || nrshadows) {
- /*
- * As truncation uses a lockless tree lookup, cycle
- * the tree lock to make sure any ongoing tree
- * modification that does not see AS_EXITING is
- * completed before starting the final truncate.
- */
- spin_lock_irq(&mapping->tree_lock);
- spin_unlock_irq(&mapping->tree_lock);
-
- truncate_inode_pages(mapping, 0);
- }
-}
-EXPORT_SYMBOL(truncate_inode_pages_final);
-
-/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
@@ -477,31 +328,32 @@ EXPORT_SYMBOL(truncate_inode_pages_final);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
+ /*
+ * Note: this function may get called on a shmem/tmpfs mapping:
+ * pagevec_lookup() might then return 0 prematurely (because it
+ * got a gangful of swap entries); but it's hardly worth worrying
+ * about - it can rarely have anything to free from such a mapping
+ * (most pages are dirty), and already skips over any difficulties.
+ */
+
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
+ index = page->index;
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
if (!trylock_page(page))
continue;
WARN_ON(page->index != index);
@@ -515,7 +367,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
deactivate_page(page);
count += ret;
}
- pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -546,7 +397,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -583,7 +434,6 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
pgoff_t index;
int i;
@@ -594,23 +444,17 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
cleancache_invalidate_inode(mapping);
pagevec_init(&pvec, 0);
index = start;
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
+ while (index <= end && pagevec_lookup(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
- index = indices[i];
+ index = page->index;
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
lock_page(page);
WARN_ON(page->index != index);
if (page->mapping != mapping) {
@@ -648,7 +492,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
ret = ret2;
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -677,6 +520,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
/**
* truncate_pagecache - unmap and remove pagecache that has been truncated
* @inode: inode
+ * @oldsize: old file size
* @newsize: new file size
*
* inode's new i_size must already be written before truncate_pagecache
@@ -689,7 +533,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
* situations such as writepage being called for a page that has already
* had its underlying blocks deallocated.
*/
-void truncate_pagecache(struct inode *inode, loff_t newsize)
+void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
{
struct address_space *mapping = inode->i_mapping;
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -723,8 +567,12 @@ EXPORT_SYMBOL(truncate_pagecache);
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
+ loff_t oldsize;
+
+ oldsize = inode->i_size;
i_size_write(inode, newsize);
- truncate_pagecache(inode, newsize);
+
+ truncate_pagecache(inode, oldsize, newsize);
}
EXPORT_SYMBOL(truncate_setsize);
@@ -750,8 +598,10 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
* This rounding is currently just for example: unmap_mapping_range
* expands its hole outwards, whereas we want it to contract the hole
* inwards. However, existing callers of truncate_pagecache_range are
- * doing their own page rounding first. Note that unmap_mapping_range
- * allows holelen 0 for all, and we allow lend -1 for end of file.
+ * doing their own page rounding first; and truncate_inode_pages_range
+ * currently BUGs if lend is not pagealigned-1 (it handles partial
+ * page at start of hole, but not partial page at end of hole). Note
+ * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
*/
/*
diff --git a/mm/util.c b/mm/util.c
index d5ea733c508..c55e26b17d9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,17 +1,10 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-#include <linux/mman.h>
-#include <linux/hugetlb.h>
-#include <linux/vmalloc.h>
-
#include <asm/uaccess.h>
#include "internal.h"
@@ -300,6 +293,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->unmap_area = arch_unmap_area;
}
#endif
@@ -309,7 +303,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* If the architecture not support this function, simply return with no
* page pinned
*/
-int __weak __get_user_pages_fast(unsigned long start,
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
@@ -340,7 +334,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
-int __weak get_user_pages_fast(unsigned long start,
+int __attribute__((weak)) get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
@@ -361,16 +355,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
{
unsigned long ret;
struct mm_struct *mm = current->mm;
- unsigned long populate;
ret = security_mmap_file(file, prot, flag);
if (!ret) {
down_write(&mm->mmap_sem);
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
- &populate);
+ ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
up_write(&mm->mmap_sem);
- if (populate)
- mm_populate(ret, populate);
}
return ret;
}
@@ -388,123 +378,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-struct address_space *page_mapping(struct page *page)
-{
- struct address_space *mapping = page->mapping;
-
- /* This happens if someone calls flush_dcache_page on slab page */
- if (unlikely(PageSlab(page)))
- return NULL;
-
- if (unlikely(PageSwapCache(page))) {
- swp_entry_t entry;
-
- entry.val = page_private(page);
- mapping = swap_address_space(entry);
- } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
- mapping = NULL;
- return mapping;
-}
-
-int overcommit_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret == 0 && write)
- sysctl_overcommit_kbytes = 0;
- return ret;
-}
-
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
-
- ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
- if (ret == 0 && write)
- sysctl_overcommit_ratio = 0;
- return ret;
-}
-
-/*
- * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
- */
-unsigned long vm_commit_limit(void)
-{
- unsigned long allowed;
-
- if (sysctl_overcommit_kbytes)
- allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
- else
- allowed = ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100);
- allowed += total_swap_pages;
-
- return allowed;
-}
-
-/**
- * get_cmdline() - copy the cmdline value to a buffer.
- * @task: the task whose cmdline value to copy.
- * @buffer: the buffer to copy to.
- * @buflen: the length of the buffer. Larger cmdline values are truncated
- * to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
- * not guarantee an ending NULL byte.
- */
-int get_cmdline(struct task_struct *task, char *buffer, int buflen)
-{
- int res = 0;
- unsigned int len;
- struct mm_struct *mm = get_task_mm(task);
- if (!mm)
- goto out;
- if (!mm->arg_end)
- goto out_mm; /* Shh! No looking before we're done */
-
- len = mm->arg_end - mm->arg_start;
-
- if (len > buflen)
- len = buflen;
-
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
-
- /*
- * If the nul at the end of args has been overwritten, then
- * assume application is using setproctitle(3).
- */
- if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
- len = strnlen(buffer, res);
- if (len < res) {
- res = len;
- } else {
- len = mm->env_end - mm->env_start;
- if (len > buflen - res)
- len = buflen - res;
- res += access_process_vm(task, mm->env_start,
- buffer+res, len, 0);
- res = strnlen(buffer, res);
- }
- }
-out_mm:
- mmput(mm);
-out:
- return res;
-}
-
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmacache.c b/mm/vmacache.c
deleted file mode 100644
index 9f25af825de..00000000000
--- a/mm/vmacache.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (C) 2014 Davidlohr Bueso.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-
-/*
- * Flush vma caches for threads that share a given mm.
- *
- * The operation is safe because the caller holds the mmap_sem
- * exclusively and other threads accessing the vma cache will
- * have mmap_sem held at least for read, so no extra locking
- * is required to maintain the vma cache.
- */
-void vmacache_flush_all(struct mm_struct *mm)
-{
- struct task_struct *g, *p;
-
- /*
- * Single threaded tasks need not iterate the entire
- * list of process. We can avoid the flushing as well
- * since the mm's seqnum was increased and don't have
- * to worry about other threads' seqnum. Current's
- * flush will occur upon the next lookup.
- */
- if (atomic_read(&mm->mm_users) == 1)
- return;
-
- rcu_read_lock();
- for_each_process_thread(g, p) {
- /*
- * Only flush the vmacache pointers as the
- * mm seqnum is already set and curr's will
- * be set upon invalidation when the next
- * lookup is done.
- */
- if (mm == p->mm)
- vmacache_flush(p);
- }
- rcu_read_unlock();
-}
-
-/*
- * This task may be accessing a foreign mm via (for example)
- * get_user_pages()->find_vma(). The vmacache is task-local and this
- * task's vmacache pertains to a different mm (ie, its own). There is
- * nothing we can do here.
- *
- * Also handle the case where a kernel thread has adopted this mm via use_mm().
- * That kernel thread's vmacache is not applicable to this mm.
- */
-static bool vmacache_valid_mm(struct mm_struct *mm)
-{
- return current->mm == mm && !(current->flags & PF_KTHREAD);
-}
-
-void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
-{
- if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache[VMACACHE_HASH(addr)] = newvma;
-}
-
-static bool vmacache_valid(struct mm_struct *mm)
-{
- struct task_struct *curr;
-
- if (!vmacache_valid_mm(mm))
- return false;
-
- curr = current;
- if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
- /*
- * First attempt will always be invalid, initialize
- * the new cache for this task here.
- */
- curr->vmacache_seqnum = mm->vmacache_seqnum;
- vmacache_flush(curr);
- return false;
- }
- return true;
-}
-
-struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
-{
- int i;
-
- if (!vmacache_valid(mm))
- return NULL;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache[i];
-
- if (!vma)
- continue;
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
-
- return NULL;
-}
-
-#ifndef CONFIG_MMU
-struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- int i;
-
- if (!vmacache_valid(mm))
- return NULL;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache[i];
-
- if (vma && vma->vm_start == start && vma->vm_end == end) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
-
- return NULL;
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f64632b6719..5123a169ab7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -27,32 +27,10 @@
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/llist.h>
-
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
-struct vfree_deferred {
- struct llist_head list;
- struct work_struct wq;
-};
-static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
-
-static void __vunmap(const void *, int);
-
-static void free_work(struct work_struct *w)
-{
- struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *llnode = llist_del_all(&p->list);
- while (llnode) {
- void *p = llnode;
- llnode = llist_next(llnode);
- __vunmap(p, 1);
- }
-}
-
/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -271,9 +249,19 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
+struct vmap_area {
+ unsigned long va_start;
+ unsigned long va_end;
+ unsigned long flags;
+ struct rb_node rb_node; /* address sorted rbtree */
+ struct list_head list; /* address sorted list */
+ struct list_head purge_list; /* "lazy purge" list */
+ struct vm_struct *vm;
+ struct rcu_head rcu_head;
+};
+
static DEFINE_SPINLOCK(vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
+static LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -294,7 +282,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
va = rb_entry(n, struct vmap_area, rb_node);
if (addr < va->va_start)
n = n->rb_left;
- else if (addr >= va->va_end)
+ else if (addr > va->va_start)
n = n->rb_right;
else
return va;
@@ -325,7 +313,7 @@ static void __insert_vmap_area(struct vmap_area *va)
rb_link_node(&va->rb_node, parent, p);
rb_insert_color(&va->rb_node, &vmap_area_root);
- /* address-sort this list */
+ /* address-sort this list so it is usable like the vmlist */
tmp = rb_prev(&va->rb_node);
if (tmp) {
struct vmap_area *prev;
@@ -361,12 +349,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
- /*
- * Only scan the relevant parts containing pointers to other objects
- * to avoid false negatives.
- */
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
-
retry:
spin_lock(&vmap_area_lock);
/*
@@ -396,12 +378,12 @@ nocache:
addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
- if (addr + size < addr)
+ if (addr + size - 1 < addr)
goto overflow;
} else {
addr = ALIGN(vstart, align);
- if (addr + size < addr)
+ if (addr + size - 1 < addr)
goto overflow;
n = vmap_area_root.rb_node;
@@ -428,7 +410,7 @@ nocache:
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
+ if (addr + size - 1 < addr)
goto overflow;
if (list_is_last(&first->list, &vmap_area_list))
@@ -760,7 +742,9 @@ struct vmap_block_queue {
struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
+ struct vmap_block_queue *vbq;
unsigned long free, dirty;
+ DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
struct list_head free_list;
struct rcu_head rcu_head;
@@ -826,6 +810,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vb->va = va;
vb->free = VMAP_BBMAP_BITS;
vb->dirty = 0;
+ bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
INIT_LIST_HEAD(&vb->free_list);
@@ -837,6 +822,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
radix_tree_preload_end();
vbq = &get_cpu_var(vmap_block_queue);
+ vb->vbq = vbq;
spin_lock(&vbq->lock);
list_add_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
@@ -877,6 +863,7 @@ static void purge_fragmented_blocks(int cpu)
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
@@ -894,6 +881,11 @@ static void purge_fragmented_blocks(int cpu)
}
}
+static void purge_fragmented_blocks_thiscpu(void)
+{
+ purge_fragmented_blocks(smp_processor_id());
+}
+
static void purge_fragmented_blocks_allcpus(void)
{
int cpu;
@@ -908,6 +900,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
+ int purge = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -931,7 +924,17 @@ again:
if (vb->free < 1UL << order)
goto next;
- i = VMAP_BBMAP_BITS - vb->free;
+ i = bitmap_find_free_region(vb->alloc_map,
+ VMAP_BBMAP_BITS, order);
+
+ if (i < 0) {
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
+ /* fragmented and no outstanding allocations */
+ BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
+ purge = 1;
+ }
+ goto next;
+ }
addr = vb->va->va_start + (i << PAGE_SHIFT);
BUG_ON(addr_to_vb_idx(addr) !=
addr_to_vb_idx(vb->va->va_start));
@@ -947,6 +950,9 @@ next:
spin_unlock(&vb->lock);
}
+ if (purge)
+ purge_fragmented_blocks_thiscpu();
+
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
@@ -1024,16 +1030,15 @@ void vm_unmap_aliases(void)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i, j;
+ int i;
spin_lock(&vb->lock);
i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
- if (i < VMAP_BBMAP_BITS) {
+ while (i < VMAP_BBMAP_BITS) {
unsigned long s, e;
-
- j = find_last_bit(vb->dirty_map,
- VMAP_BBMAP_BITS);
- j = j + 1; /* need exclusive index */
+ int j;
+ j = find_next_zero_bit(vb->dirty_map,
+ VMAP_BBMAP_BITS, i);
s = vb->va->va_start + (i << PAGE_SHIFT);
e = vb->va->va_start + (j << PAGE_SHIFT);
@@ -1043,6 +1048,10 @@ void vm_unmap_aliases(void)
start = s;
if (e > end)
end = e;
+
+ i = j;
+ i = find_next_bit(vb->dirty_map,
+ VMAP_BBMAP_BITS, i);
}
spin_unlock(&vb->lock);
}
@@ -1085,12 +1094,6 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @node: prefer to allocate data structures on this node
* @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
- * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
- * faster than vmap so it's good. But if you mix long-life and short-life
- * objects with vm_map_ram(), it could consume lots of address space through
- * fragmentation (especially on a 32bit machine). You could see failures in
- * the end. Please use this function for short-lived objects.
- *
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
@@ -1122,7 +1125,6 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
}
EXPORT_SYMBOL(vm_map_ram);
-static struct vm_struct *vmlist __initdata;
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1182,14 +1184,10 @@ void __init vmalloc_init(void)
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
- struct vfree_deferred *p;
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
- p = &per_cpu(vfree_deferred, i);
- init_llist_head(&p->list);
- INIT_WORK(&p->wq, free_work);
}
/* Import existing vmlist entries. */
@@ -1268,12 +1266,11 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
vunmap_page_range(addr, end);
flush_tlb_kernel_range(addr, end);
}
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + get_vm_area_size(area);
+ unsigned long end = addr + area->size - PAGE_SIZE;
int err;
err = vmap_page_range(addr, end, prot, *pages);
@@ -1286,28 +1283,41 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
}
EXPORT_SYMBOL_GPL(map_vm_area);
+/*** Old vmalloc interfaces ***/
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, const void *caller)
{
- spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
va->flags |= VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
}
-static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+static void insert_vmalloc_vmlist(struct vm_struct *vm)
{
- /*
- * Before removing VM_UNINITIALIZED,
- * we should make sure that vm has proper values.
- * Pair with smp_rmb() in show_numa_info().
- */
- smp_wmb();
- vm->flags &= ~VM_UNINITIALIZED;
+ struct vm_struct *tmp, **p;
+
+ vm->flags &= ~VM_UNLIST;
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr)
+ break;
+ }
+ vm->next = *p;
+ *p = vm;
+ write_unlock(&vmlist_lock);
+}
+
+static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ setup_vmalloc_vm(vm, va, flags, caller);
+ insert_vmalloc_vmlist(vm);
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1318,8 +1328,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
struct vm_struct *area;
BUG_ON(in_interrupt());
- if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ if (flags & VM_IOREMAP) {
+ int bit = fls(size);
+
+ if (bit > IOREMAP_MAX_ORDER)
+ bit = IOREMAP_MAX_ORDER;
+ else if (bit < PAGE_SHIFT)
+ bit = PAGE_SHIFT;
+
+ align = 1ul << bit;
+ }
size = PAGE_ALIGN(size);
if (unlikely(!size))
@@ -1340,7 +1358,17 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * When this function is called from __vmalloc_node_range,
+ * we do not add vm_struct to vmlist here to avoid
+ * accessing uninitialized members of vm_struct such as
+ * pages and nr_pages fields. They will be set later.
+ * To distinguish it from others, we use a VM_UNLIST flag.
+ */
+ if (flags & VM_UNLIST)
+ setup_vmalloc_vm(area, va, flags, caller);
+ else
+ insert_vmalloc_vm(area, va, flags, caller);
return area;
}
@@ -1348,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1357,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
+ caller);
}
/**
@@ -1373,15 +1401,14 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- NUMA_NO_NODE, GFP_KERNEL,
- __builtin_return_address(0));
+ -1, GFP_KERNEL, __builtin_return_address(0));
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- NUMA_NO_NODE, GFP_KERNEL, caller);
+ -1, GFP_KERNEL, caller);
}
/**
@@ -1419,10 +1446,19 @@ struct vm_struct *remove_vm_area(const void *addr)
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
- va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- spin_unlock(&vmap_area_lock);
+ if (!(vm->flags & VM_UNLIST)) {
+ struct vm_struct *tmp, **p;
+ /*
+ * remove from list and disallow access to
+ * this vm_struct before unmap. (address range
+ * confliction is maintained by vmap.)
+ */
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+ ;
+ *p = tmp->next;
+ write_unlock(&vmlist_lock);
+ }
vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
@@ -1440,9 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (!addr)
return;
- if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
- addr))
+ if ((PAGE_SIZE-1) & (unsigned long)addr) {
+ WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
return;
+ }
area = remove_vm_area(addr);
if (unlikely(!area)) {
@@ -1473,7 +1510,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1482,26 +1519,15 @@ static void __vunmap(const void *addr, int deallocate_pages)
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
- *
- * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
+ * Must not be called in interrupt context.
*/
void vfree(const void *addr)
{
- BUG_ON(in_nmi());
+ BUG_ON(in_interrupt());
kmemleak_free(addr);
- if (!addr)
- return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
- __vunmap(addr, 1);
+ __vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
@@ -1518,8 +1544,7 @@ void vunmap(const void *addr)
{
BUG_ON(in_interrupt());
might_sleep();
- if (addr)
- __vunmap(addr, 0);
+ __vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
@@ -1561,26 +1586,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, int node, const void *caller)
{
const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
- PAGE_KERNEL, node, area->caller);
+ PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
+ area->caller = caller;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
@@ -1591,7 +1617,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
- if (node == NUMA_NO_NODE)
+ if (node < 0)
page = alloc_page(tmp_mask);
else
page = alloc_pages_node(node, tmp_mask, order);
@@ -1624,7 +1650,7 @@ fail:
* @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
+ * @node: node to use for allocation or -1
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
@@ -1643,28 +1669,27 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
goto fail;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
start, end, node, gfp_mask, caller);
if (!area)
goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
if (!addr)
return NULL;
/*
- * In this function, newly allocated vm_struct has VM_UNINITIALIZED
- * flag. It means that vm_struct is not fully initialized.
- * Now, it is fully initialized, so remove this flag here.
+ * In this function, newly allocated vm_struct is not added
+ * to vmlist at __get_vm_area_node(). so, it is added here.
*/
- clear_vm_uninitialized_flag(area);
+ insert_vmalloc_vmlist(area);
/*
- * A ref_count = 2 is needed because vm_struct allocated in
- * __get_vm_area_node() contains a reference to the virtual address of
- * the vmalloc'ed block.
+ * A ref_count = 3 is needed because the vm_struct and vmap_area
+ * structures allocated in the __get_vm_area_node() function contain
+ * references to the virtual address of the vmalloc'ed block.
*/
- kmemleak_alloc(addr, real_size, 2, gfp_mask);
+ kmemleak_alloc(addr, real_size, 3, gfp_mask);
return addr;
@@ -1681,7 +1706,7 @@ fail:
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
+ * @node: node to use for allocation or -1
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
@@ -1698,7 +1723,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, prot, -1,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
@@ -1721,8 +1746,7 @@ static inline void *__vmalloc_node_flags(unsigned long size,
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_HIGHMEM);
+ return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
@@ -1738,7 +1762,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
+ return __vmalloc_node_flags(size, -1,
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -1757,8 +1781,7 @@ void *vmalloc_user(unsigned long size)
ret = __vmalloc_node(size, SHMLBA,
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
+ PAGE_KERNEL, -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
area->flags |= VM_USERMAP;
@@ -1823,7 +1846,7 @@ EXPORT_SYMBOL(vzalloc_node);
void *vmalloc_exec(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
- NUMA_NO_NODE, __builtin_return_address(0));
+ -1, __builtin_return_address(0));
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1844,7 +1867,7 @@ void *vmalloc_exec(unsigned long size)
void *vmalloc_32(unsigned long size)
{
return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ -1, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -1861,7 +1884,7 @@ void *vmalloc_32_user(unsigned long size)
void *ret;
ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ -1, __builtin_return_address(0));
if (ret) {
area = find_vm_area(ret);
area->flags |= VM_USERMAP;
@@ -1979,8 +2002,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
long vread(char *buf, char *addr, unsigned long count)
{
- struct vmap_area *va;
- struct vm_struct *vm;
+ struct vm_struct *tmp;
char *vaddr, *buf_start = buf;
unsigned long buflen = count;
unsigned long n;
@@ -1989,17 +2011,10 @@ long vread(char *buf, char *addr, unsigned long count)
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
-
- if (!(va->flags & VM_VM_AREA))
- continue;
-
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
+ read_lock(&vmlist_lock);
+ for (tmp = vmlist; count && tmp; tmp = tmp->next) {
+ vaddr = (char *) tmp->addr;
+ if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2009,10 +2024,10 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + get_vm_area_size(vm) - addr;
+ n = vaddr + tmp->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(vm->flags & VM_IOREMAP))
+ if (!(tmp->flags & VM_IOREMAP))
aligned_vread(buf, addr, n);
else /* IOREMAP area is treated as memory hole */
memset(buf, 0, n);
@@ -2021,7 +2036,7 @@ long vread(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- spin_unlock(&vmap_area_lock);
+ read_unlock(&vmlist_lock);
if (buf == buf_start)
return 0;
@@ -2060,8 +2075,7 @@ finished:
long vwrite(char *buf, char *addr, unsigned long count)
{
- struct vmap_area *va;
- struct vm_struct *vm;
+ struct vm_struct *tmp;
char *vaddr;
unsigned long n, buflen;
int copied = 0;
@@ -2071,17 +2085,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
buflen = count;
- spin_lock(&vmap_area_lock);
- list_for_each_entry(va, &vmap_area_list, list) {
- if (!count)
- break;
-
- if (!(va->flags & VM_VM_AREA))
- continue;
-
- vm = va->vm;
- vaddr = (char *) vm->addr;
- if (addr >= vaddr + get_vm_area_size(vm))
+ read_lock(&vmlist_lock);
+ for (tmp = vmlist; count && tmp; tmp = tmp->next) {
+ vaddr = (char *) tmp->addr;
+ if (addr >= vaddr + tmp->size - PAGE_SIZE)
continue;
while (addr < vaddr) {
if (count == 0)
@@ -2090,10 +2097,10 @@ long vwrite(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + get_vm_area_size(vm) - addr;
+ n = vaddr + tmp->size - PAGE_SIZE - addr;
if (n > count)
n = count;
- if (!(vm->flags & VM_IOREMAP)) {
+ if (!(tmp->flags & VM_IOREMAP)) {
aligned_vwrite(buf, addr, n);
copied++;
}
@@ -2102,50 +2109,49 @@ long vwrite(char *buf, char *addr, unsigned long count)
count -= n;
}
finished:
- spin_unlock(&vmap_area_lock);
+ read_unlock(&vmlist_lock);
if (!copied)
return 0;
return buflen;
}
/**
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
- * @vma: vma to cover
- * @uaddr: target user address to start at
- * @kaddr: virtual address of vmalloc kernel memory
- * @size: size of map area
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
*
* Returns: 0 for success, -Exxx on failure
*
- * This function checks that @kaddr is a valid vmalloc'ed area,
- * and that it is big enough to cover the range starting at
- * @uaddr in @vma. Will return failure if that criteria isn't
- * met.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
*
* Similar to remap_pfn_range() (see mm/memory.c)
*/
-int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
- void *kaddr, unsigned long size)
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
{
struct vm_struct *area;
+ unsigned long uaddr = vma->vm_start;
+ unsigned long usize = vma->vm_end - vma->vm_start;
- size = PAGE_ALIGN(size);
-
- if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
+ if ((PAGE_SIZE-1) & (unsigned long)addr)
return -EINVAL;
- area = find_vm_area(kaddr);
+ area = find_vm_area(addr);
if (!area)
return -EINVAL;
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
return -EINVAL;
+ addr += pgoff << PAGE_SHIFT;
do {
- struct page *page = vmalloc_to_page(kaddr);
+ struct page *page = vmalloc_to_page(addr);
int ret;
ret = vm_insert_page(vma, uaddr, page);
@@ -2153,44 +2159,21 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
return ret;
uaddr += PAGE_SIZE;
- kaddr += PAGE_SIZE;
- size -= PAGE_SIZE;
- } while (size > 0);
+ addr += PAGE_SIZE;
+ usize -= PAGE_SIZE;
+ } while (usize > 0);
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range_partial);
-
-/**
- * remap_vmalloc_range - map vmalloc pages to userspace
- * @vma: vma to cover (map full range of vma)
- * @addr: vmalloc memory
- * @pgoff: number of pages into addr before first page to map
- *
- * Returns: 0 for success, -Exxx on failure
- *
- * This function checks that addr is a valid vmalloc'ed area, and
- * that it is big enough to cover the vma. Will return failure if
- * that criteria isn't met.
- *
- * Similar to remap_pfn_range() (see mm/memory.c)
- */
-int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
- unsigned long pgoff)
-{
- return remap_vmalloc_range_partial(vma, vma->vm_start,
- addr + (pgoff << PAGE_SHIFT),
- vma->vm_end - vma->vm_start);
-}
EXPORT_SYMBOL(remap_vmalloc_range);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __weak vmalloc_sync_all(void)
+void __attribute__((weak)) vmalloc_sync_all(void)
{
}
@@ -2497,8 +2480,8 @@ found:
/* insert all vm's */
for (area = 0; area < nr_vms; area++)
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
- pcpu_get_vm_areas);
+ insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ pcpu_get_vm_areas);
kfree(vas);
return vms;
@@ -2533,19 +2516,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
- __acquires(&vmap_area_lock)
+ __acquires(&vmlist_lock)
{
loff_t n = *pos;
- struct vmap_area *va;
+ struct vm_struct *v;
- spin_lock(&vmap_area_lock);
- va = list_entry((&vmap_area_list)->next, typeof(*va), list);
- while (n > 0 && &va->list != &vmap_area_list) {
+ read_lock(&vmlist_lock);
+ v = vmlist;
+ while (n > 0 && v) {
n--;
- va = list_entry(va->list.next, typeof(*va), list);
+ v = v->next;
}
- if (!n && &va->list != &vmap_area_list)
- return va;
+ if (!n)
+ return v;
return NULL;
@@ -2553,20 +2536,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct vmap_area *va = p, *next;
+ struct vm_struct *v = p;
++*pos;
- next = list_entry(va->list.next, typeof(*va), list);
- if (&next->list != &vmap_area_list)
- return next;
-
- return NULL;
+ return v->next;
}
static void s_stop(struct seq_file *m, void *p)
- __releases(&vmap_area_lock)
+ __releases(&vmlist_lock)
{
- spin_unlock(&vmap_area_lock);
+ read_unlock(&vmlist_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -2577,11 +2556,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
- if (v->flags & VM_UNINITIALIZED)
- return;
-
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2595,17 +2569,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static int s_show(struct seq_file *m, void *p)
{
- struct vmap_area *va = p;
- struct vm_struct *v;
-
- /*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
- */
- if (!(va->flags & VM_VM_AREA))
- return 0;
-
- v = va->vm;
+ struct vm_struct *v = p;
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
@@ -2620,19 +2584,19 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
if (v->flags & VM_IOREMAP)
- seq_puts(m, " ioremap");
+ seq_printf(m, " ioremap");
if (v->flags & VM_ALLOC)
- seq_puts(m, " vmalloc");
+ seq_printf(m, " vmalloc");
if (v->flags & VM_MAP)
- seq_puts(m, " vmap");
+ seq_printf(m, " vmap");
if (v->flags & VM_USERMAP)
- seq_puts(m, " user");
+ seq_printf(m, " user");
if (v->flags & VM_VPAGES)
- seq_puts(m, " vpages");
+ seq_printf(m, " vpages");
show_numa_info(m, v);
seq_putc(m, '\n');
@@ -2678,53 +2642,5 @@ static int __init proc_vmalloc_init(void)
return 0;
}
module_init(proc_vmalloc_init);
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
- struct vmap_area *va;
- unsigned long free_area_size;
- unsigned long prev_end;
-
- vmi->used = 0;
- vmi->largest_chunk = 0;
-
- prev_end = VMALLOC_START;
-
- spin_lock(&vmap_area_lock);
-
- if (list_empty(&vmap_area_list)) {
- vmi->largest_chunk = VMALLOC_TOTAL;
- goto out;
- }
-
- list_for_each_entry(va, &vmap_area_list, list) {
- unsigned long addr = va->va_start;
-
- /*
- * Some archs keep another range for modules in vmalloc space
- */
- if (addr < VMALLOC_START)
- continue;
- if (addr >= VMALLOC_END)
- break;
-
- if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
- continue;
-
- vmi->used += (va->va_end - va->va_start);
-
- free_area_size = addr - prev_end;
- if (vmi->largest_chunk < free_area_size)
- vmi->largest_chunk = free_area_size;
-
- prev_end = va->va_end;
- }
-
- if (VMALLOC_END - prev_end > vmi->largest_chunk)
- vmi->largest_chunk = VMALLOC_END - prev_end;
-
-out:
- spin_unlock(&vmap_area_lock);
-}
#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
deleted file mode 100644
index d4042e75f7c..00000000000
--- a/mm/vmpressure.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Linux VM pressure
- *
- * Copyright 2012 Linaro Ltd.
- * Anton Vorontsov <anton.vorontsov@linaro.org>
- *
- * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
- * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/log2.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/eventfd.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/printk.h>
-#include <linux/vmpressure.h>
-
-/*
- * The window size (vmpressure_win) is the number of scanned pages before
- * we try to analyze scanned/reclaimed ratio. So the window is used as a
- * rate-limit tunable for the "low" level notification, and also for
- * averaging the ratio for medium/critical levels. Using small window
- * sizes can cause lot of false positives, but too big window size will
- * delay the notifications.
- *
- * As the vmscan reclaimer logic works with chunks which are multiple of
- * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
- *
- * TODO: Make the window size depend on machine size, as we do for vmstat
- * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
- */
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
-
-/*
- * These thresholds are used when we account memory pressure through
- * scanned/reclaimed ratio. The current values were chosen empirically. In
- * essence, they are percents: the higher the value, the more number
- * unsuccessful reclaims there were.
- */
-static const unsigned int vmpressure_level_med = 60;
-static const unsigned int vmpressure_level_critical = 95;
-
-/*
- * When there are too little pages left to scan, vmpressure() may miss the
- * critical pressure as number of pages will be less than "window size".
- * However, in that case the vmscan priority will raise fast as the
- * reclaimer will try to scan LRUs more deeply.
- *
- * The vmscan logic considers these special priorities:
- *
- * prio == DEF_PRIORITY (12): reclaimer starts with that value
- * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
- * prio == 0 : close to OOM, kernel scans every page in an lru
- *
- * Any value in this range is acceptable for this tunable (i.e. from 12 to
- * 0). Current value for the vmpressure_level_critical_prio is chosen
- * empirically, but the number, in essence, means that we consider
- * critical level when scanning depth is ~10% of the lru size (vmscan
- * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
- * eights).
- */
-static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
-
-static struct vmpressure *work_to_vmpressure(struct work_struct *work)
-{
- return container_of(work, struct vmpressure, work);
-}
-
-static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
-{
- struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- memcg = parent_mem_cgroup(memcg);
- if (!memcg)
- return NULL;
- return memcg_to_vmpressure(memcg);
-}
-
-enum vmpressure_levels {
- VMPRESSURE_LOW = 0,
- VMPRESSURE_MEDIUM,
- VMPRESSURE_CRITICAL,
- VMPRESSURE_NUM_LEVELS,
-};
-
-static const char * const vmpressure_str_levels[] = {
- [VMPRESSURE_LOW] = "low",
- [VMPRESSURE_MEDIUM] = "medium",
- [VMPRESSURE_CRITICAL] = "critical",
-};
-
-static enum vmpressure_levels vmpressure_level(unsigned long pressure)
-{
- if (pressure >= vmpressure_level_critical)
- return VMPRESSURE_CRITICAL;
- else if (pressure >= vmpressure_level_med)
- return VMPRESSURE_MEDIUM;
- return VMPRESSURE_LOW;
-}
-
-static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
- unsigned long reclaimed)
-{
- unsigned long scale = scanned + reclaimed;
- unsigned long pressure;
-
- /*
- * We calculate the ratio (in percents) of how many pages were
- * scanned vs. reclaimed in a given time frame (window). Note that
- * time is in VM reclaimer's "ticks", i.e. number of pages
- * scanned. This makes it possible to set desired reaction time
- * and serves as a ratelimit.
- */
- pressure = scale - (reclaimed * scale / scanned);
- pressure = pressure * 100 / scale;
-
- pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
- scanned, reclaimed);
-
- return vmpressure_level(pressure);
-}
-
-struct vmpressure_event {
- struct eventfd_ctx *efd;
- enum vmpressure_levels level;
- struct list_head node;
-};
-
-static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
-{
- struct vmpressure_event *ev;
- enum vmpressure_levels level;
- bool signalled = false;
-
- level = vmpressure_calc_level(scanned, reclaimed);
-
- mutex_lock(&vmpr->events_lock);
-
- list_for_each_entry(ev, &vmpr->events, node) {
- if (level >= ev->level) {
- eventfd_signal(ev->efd, 1);
- signalled = true;
- }
- }
-
- mutex_unlock(&vmpr->events_lock);
-
- return signalled;
-}
-
-static void vmpressure_work_fn(struct work_struct *work)
-{
- struct vmpressure *vmpr = work_to_vmpressure(work);
- unsigned long scanned;
- unsigned long reclaimed;
-
- /*
- * Several contexts might be calling vmpressure(), so it is
- * possible that the work was rescheduled again before the old
- * work context cleared the counters. In that case we will run
- * just after the old work returns, but then scanned might be zero
- * here. No need for any locks here since we don't care if
- * vmpr->reclaimed is in sync.
- */
- if (!vmpr->scanned)
- return;
-
- spin_lock(&vmpr->sr_lock);
- scanned = vmpr->scanned;
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
- spin_unlock(&vmpr->sr_lock);
-
- do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
- break;
- /*
- * If not handled, propagate the event upward into the
- * hierarchy.
- */
- } while ((vmpr = vmpressure_parent(vmpr)));
-}
-
-/**
- * vmpressure() - Account memory pressure through scanned/reclaimed ratio
- * @gfp: reclaimer's gfp mask
- * @memcg: cgroup memory controller handle
- * @scanned: number of pages scanned
- * @reclaimed: number of pages reclaimed
- *
- * This function should be called from the vmscan reclaim path to account
- * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
- * pressure index is then further refined and averaged over time.
- *
- * This function does not return any value.
- */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
- unsigned long scanned, unsigned long reclaimed)
-{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
-
- /*
- * Here we only want to account pressure that userland is able to
- * help us with. For example, suppose that DMA zone is under
- * pressure; if we notify userland about that kind of pressure,
- * then it will be mostly a waste as it will trigger unnecessary
- * freeing of memory by userland (since userland is more likely to
- * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
- * is why we include only movable, highmem and FS/IO pages.
- * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
- * we account it too.
- */
- if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
- return;
-
- /*
- * If we got here with no pages scanned, then that is an indicator
- * that reclaimer was unable to find any shrinkable LRUs at the
- * current scanning depth. But it does not mean that we should
- * report the critical pressure, yet. If the scanning priority
- * (scanning depth) goes too high (deep), we will be notified
- * through vmpressure_prio(). But so far, keep calm.
- */
- if (!scanned)
- return;
-
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
-
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
-}
-
-/**
- * vmpressure_prio() - Account memory pressure through reclaimer priority level
- * @gfp: reclaimer's gfp mask
- * @memcg: cgroup memory controller handle
- * @prio: reclaimer's priority
- *
- * This function should be called from the reclaim path every time when
- * the vmscan's reclaiming priority (scanning depth) changes.
- *
- * This function does not return any value.
- */
-void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
-{
- /*
- * We only use prio for accounting critical level. For more info
- * see comment for vmpressure_level_critical_prio variable above.
- */
- if (prio > vmpressure_level_critical_prio)
- return;
-
- /*
- * OK, the prio is below the threshold, updating vmpressure
- * information before shrinker dives into long shrinking of long
- * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
- * to the vmpressure() basically means that we signal 'critical'
- * level.
- */
- vmpressure(gfp, memcg, vmpressure_win, 0);
-}
-
-/**
- * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
- * @memcg: memcg that is interested in vmpressure notifications
- * @eventfd: eventfd context to link notifications with
- * @args: event arguments (used to set up a pressure level threshold)
- *
- * This function associates eventfd context with the vmpressure
- * infrastructure, so that the notifications will be delivered to the
- * @eventfd. The @args parameter is a string that denotes pressure level
- * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
- * "critical").
- *
- * To be used as memcg event method.
- */
-int vmpressure_register_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd, const char *args)
-{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
- struct vmpressure_event *ev;
- int level;
-
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
- if (!strcmp(vmpressure_str_levels[level], args))
- break;
- }
-
- if (level >= VMPRESSURE_NUM_LEVELS)
- return -EINVAL;
-
- ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev)
- return -ENOMEM;
-
- ev->efd = eventfd;
- ev->level = level;
-
- mutex_lock(&vmpr->events_lock);
- list_add(&ev->node, &vmpr->events);
- mutex_unlock(&vmpr->events_lock);
-
- return 0;
-}
-
-/**
- * vmpressure_unregister_event() - Unbind eventfd from vmpressure
- * @memcg: memcg handle
- * @eventfd: eventfd context that was used to link vmpressure with the @cg
- *
- * This function does internal manipulations to detach the @eventfd from
- * the vmpressure notifications, and then frees internal resources
- * associated with the @eventfd (but the @eventfd itself is not freed).
- *
- * To be used as memcg event method.
- */
-void vmpressure_unregister_event(struct mem_cgroup *memcg,
- struct eventfd_ctx *eventfd)
-{
- struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
- struct vmpressure_event *ev;
-
- mutex_lock(&vmpr->events_lock);
- list_for_each_entry(ev, &vmpr->events, node) {
- if (ev->efd != eventfd)
- continue;
- list_del(&ev->node);
- kfree(ev);
- break;
- }
- mutex_unlock(&vmpr->events_lock);
-}
-
-/**
- * vmpressure_init() - Initialize vmpressure control structure
- * @vmpr: Structure to be initialized
- *
- * This function should be called on every allocated vmpressure structure
- * before any usage.
- */
-void vmpressure_init(struct vmpressure *vmpr)
-{
- spin_lock_init(&vmpr->sr_lock);
- mutex_init(&vmpr->events_lock);
- INIT_LIST_HEAD(&vmpr->events);
- INIT_WORK(&vmpr->work, vmpressure_work_fn);
-}
-
-/**
- * vmpressure_cleanup() - shuts down vmpressure control structure
- * @vmpr: Structure to be cleaned up
- *
- * This function should be called before the structure in which it is
- * embedded is cleaned up.
- */
-void vmpressure_cleanup(struct vmpressure *vmpr)
-{
- /*
- * Make sure there is no pending work before eventfd infrastructure
- * goes away.
- */
- flush_work(&vmpr->work);
-}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0f16ffe8eb6..196709f5ee5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,8 +11,6 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
@@ -21,7 +19,6 @@
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
-#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@@ -45,13 +42,11 @@
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
-#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
-#include <linux/balloon_compaction.h>
#include "internal.h"
@@ -86,9 +81,6 @@ struct scan_control {
/* Scan (total_size >> priority) pages at once */
int priority;
- /* anon vs. file LRUs scanning "ratio" */
- int swappiness;
-
/*
* The memory cgroup that hit its limit and as a result is the
* primary target of this reclaim invocation.
@@ -136,7 +128,7 @@ struct scan_control {
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-unsigned long vm_total_pages; /* The total number of pages which the VM controls */
+long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -153,25 +145,6 @@ static bool global_reclaim(struct scan_control *sc)
}
#endif
-static unsigned long zone_reclaimable_pages(struct zone *zone)
-{
- int nr;
-
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
-
- return nr;
-}
-
-bool zone_reclaimable(struct zone *zone)
-{
- return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
@@ -181,31 +154,14 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
}
/*
- * Add a shrinker callback to be called from the vm.
+ * Add a shrinker callback to be called from the vm
*/
-int register_shrinker(struct shrinker *shrinker)
+void register_shrinker(struct shrinker *shrinker)
{
- size_t size = sizeof(*shrinker->nr_deferred);
-
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
- if (shrinker->flags & SHRINKER_NUMA_AWARE)
- size *= nr_node_ids;
-
- shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
- if (!shrinker->nr_deferred)
- return -ENOMEM;
-
+ atomic_long_set(&shrinker->nr_in_batch, 0);
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
- return 0;
}
EXPORT_SYMBOL(register_shrinker);
@@ -217,123 +173,18 @@ void unregister_shrinker(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
- kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
-#define SHRINK_BATCH 128
-
-static unsigned long
-shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
- unsigned long nr_pages_scanned, unsigned long lru_pages)
-{
- unsigned long freed = 0;
- unsigned long long delta;
- long total_scan;
- long freeable;
- long nr;
- long new_nr;
- int nid = shrinkctl->nid;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
-
- freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0)
- return 0;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
-
- total_scan = nr;
- delta = (4 * nr_pages_scanned) / shrinker->seeks;
- delta *= freeable;
- do_div(delta, lru_pages + 1);
- total_scan += delta;
- if (total_scan < 0) {
- printk(KERN_ERR
- "shrink_slab: %pF negative objects to delete nr=%ld\n",
- shrinker->scan_objects, total_scan);
- total_scan = freeable;
- }
-
- /*
- * We need to avoid excessive windup on filesystem shrinkers
- * due to large numbers of GFP_NOFS allocations causing the
- * shrinkers to return -1 all the time. This results in a large
- * nr being built up so when a shrink that can do some work
- * comes along it empties the entire cache due to nr >>>
- * freeable. This is bad for sustaining a working set in
- * memory.
- *
- * Hence only allow the shrinker to scan the entire cache when
- * a large delta change is calculated directly.
- */
- if (delta < freeable / 4)
- total_scan = min(total_scan, freeable / 2);
-
- /*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
- */
- if (total_scan > freeable * 2)
- total_scan = freeable * 2;
-
- trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- nr_pages_scanned, lru_pages,
- freeable, delta, total_scan);
-
- /*
- * Normally, we should not scan less than batch_size objects in one
- * pass to avoid too frequent shrinker calls, but if the slab has less
- * than batch_size objects in total and we are really tight on memory,
- * we will try to reclaim all available objects, otherwise we can end
- * up failing allocations although there are plenty of reclaimable
- * objects spread over several slabs with usage less than the
- * batch_size.
- *
- * We detect the "tight on memory" situations by looking at the total
- * number of objects we want to scan (total_scan). If it is greater
- * than the total number of objects on slab (freeable), we must be
- * scanning at high prio and therefore should try to reclaim as much as
- * possible.
- */
- while (total_scan >= batch_size ||
- total_scan >= freeable) {
- unsigned long ret;
- unsigned long nr_to_scan = min(batch_size, total_scan);
-
- shrinkctl->nr_to_scan = nr_to_scan;
- ret = shrinker->scan_objects(shrinker, shrinkctl);
- if (ret == SHRINK_STOP)
- break;
- freed += ret;
-
- count_vm_events(SLABS_SCANNED, nr_to_scan);
- total_scan -= nr_to_scan;
-
- cond_resched();
- }
-
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates. If we exhausted the
- * scan, there is no need to do an update.
- */
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
- &shrinker->nr_deferred[nid]);
- else
- new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
-
- trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
- return freed;
+static inline int do_shrinker_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc,
+ unsigned long nr_to_scan)
+{
+ sc->nr_to_scan = nr_to_scan;
+ return (*shrinker->shrink)(shrinker, sc);
}
+#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
@@ -353,46 +204,115 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(struct shrink_control *shrinkctl,
+unsigned long shrink_slab(struct shrink_control *shrink,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
- unsigned long freed = 0;
+ unsigned long ret = 0;
if (nr_pages_scanned == 0)
nr_pages_scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem)) {
- /*
- * If we would return 0, our callers would understand that we
- * have nothing else to shrink and give up trying. By returning
- * 1 we keep it going and assume we'll be able to shrink next
- * time.
- */
- freed = 1;
+ /* Assume we'll be able to shrink next time */
+ ret = 1;
goto out;
}
list_for_each_entry(shrinker, &shrinker_list, list) {
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
- shrinkctl->nid = 0;
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ unsigned long long delta;
+ long total_scan;
+ long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+ max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ if (max_pass <= 0)
continue;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
+
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= max_pass;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR "shrink_slab: %pF negative objects to "
+ "delete nr=%ld\n",
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}
- for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
- if (node_online(shrinkctl->nid))
- freed += shrink_slab_node(shrinkctl, shrinker,
- nr_pages_scanned, lru_pages);
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);
+
+ while (total_scan >= batch_size) {
+ int nr_before;
+
+ nr_before = do_shrinker_shrink(shrinker, shrink, 0);
+ shrink_ret = do_shrinker_shrink(shrinker, shrink,
+ batch_size);
+ if (shrink_ret == -1)
+ break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
+ count_vm_events(SLABS_SCANNED, batch_size);
+ total_scan -= batch_size;
+
+ cond_resched();
}
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_in_batch);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_in_batch);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
cond_resched();
- return freed;
+ return ret;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -464,7 +384,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
- * If this process is currently in __generic_file_write_iter() against
+ * If this process is currently in __generic_file_aio_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
@@ -483,7 +403,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- pr_info("%s: orphaned page\n", __func__);
+ printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -529,8 +449,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
*/
-static int __remove_mapping(struct address_space *mapping, struct page *page,
- bool reclaimed)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -576,23 +495,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swapcache_free(swap, page);
} else {
void (*freepage)(struct page *);
- void *shadow = NULL;
freepage = mapping->a_ops->freepage;
- /*
- * Remember a shadow entry for reclaimed file cache in
- * order to detect refaults, thus thrashing, later on.
- *
- * But don't store shadows in an address space that is
- * already exiting. This is not just an optizimation,
- * inode reclaim needs to empty out the radix tree or
- * the nodes are lost. Don't plant shadows behind its
- * back.
- */
- if (reclaimed && page_is_file_cache(page) &&
- !mapping_exiting(mapping))
- shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow);
+
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
@@ -615,7 +521,7 @@ cannot_free:
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
- if (__remove_mapping(mapping, page, false)) {
+ if (__remove_mapping(mapping, page)) {
/*
* Unfreezing the refcount with 1 rather than 2 effectively
* drops the pagecache ref for us without requiring another
@@ -638,10 +544,11 @@ int remove_mapping(struct address_space *mapping, struct page *page)
*/
void putback_lru_page(struct page *page)
{
- bool is_unevictable;
+ int lru;
+ int active = !!TestClearPageActive(page);
int was_unevictable = PageUnevictable(page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageLRU(page));
redo:
ClearPageUnevictable(page);
@@ -653,14 +560,14 @@ redo:
* unevictable page on [in]active list.
* We know how to handle that.
*/
- is_unevictable = false;
- lru_cache_add(page);
+ lru = active + page_lru_base_type(page);
+ lru_cache_add_lru(page, lru);
} else {
/*
* Put unevictable pages directly on zone's unevictable
* list.
*/
- is_unevictable = true;
+ lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
/*
* When racing with an mlock or AS_UNEVICTABLE clearing
@@ -680,7 +587,7 @@ redo:
* page is on unevictable list, it never be freed. To avoid that,
* check after we added it to the list, again.
*/
- if (is_unevictable && page_evictable(page)) {
+ if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
if (!isolate_lru_page(page)) {
put_page(page);
goto redo;
@@ -691,9 +598,9 @@ redo:
*/
}
- if (was_unevictable && !is_unevictable)
+ if (was_unevictable && lru != LRU_UNEVICTABLE)
count_vm_event(UNEVICTABLE_PGRESCUED);
- else if (!was_unevictable && is_unevictable)
+ else if (!was_unevictable && lru == LRU_UNEVICTABLE)
count_vm_event(UNEVICTABLE_PGCULLED);
put_page(page); /* drop ref from isolate */
@@ -761,35 +668,6 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
}
-/* Check if a page is dirty or under writeback */
-static void page_check_dirty_writeback(struct page *page,
- bool *dirty, bool *writeback)
-{
- struct address_space *mapping;
-
- /*
- * Anonymous pages are not handled by flushers and must be written
- * from reclaim context. Do not stall reclaim based on them
- */
- if (!page_is_file_cache(page)) {
- *dirty = false;
- *writeback = false;
- return;
- }
-
- /* By default assume that the page flags are accurate */
- *dirty = PageDirty(page);
- *writeback = PageWriteback(page);
-
- /* Verify dirty/writeback state if the filesystem supports it */
- if (!page_has_private(page))
- return;
-
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops->is_dirty_writeback)
- mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
-}
-
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -798,21 +676,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_unqueued_dirty,
- unsigned long *ret_nr_congested,
unsigned long *ret_nr_writeback,
- unsigned long *ret_nr_immediate,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
- unsigned long nr_unqueued_dirty = 0;
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
cond_resched();
@@ -822,7 +695,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct page *page;
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
- bool dirty, writeback;
cond_resched();
@@ -832,8 +704,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (!trylock_page(page))
goto keep;
- VM_BUG_ON_PAGE(PageActive(page), page);
- VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -850,77 +722,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
- /*
- * The number of dirty pages determines if a zone is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
- */
- page_check_dirty_writeback(page, &dirty, &writeback);
- if (dirty || writeback)
- nr_dirty++;
-
- if (dirty && !writeback)
- nr_unqueued_dirty++;
-
- /*
- * Treat this page as congested if the underlying BDI is or if
- * pages are cycling through the LRU so quickly that the
- * pages marked for immediate reclaim are making it to the
- * end of the LRU a second time.
- */
- mapping = page_mapping(page);
- if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
- (writeback && PageReclaim(page)))
- nr_congested++;
-
- /*
- * If a page at the tail of the LRU is under writeback, there
- * are three cases to consider.
- *
- * 1) If reclaim is encountering an excessive number of pages
- * under writeback and this page is both under writeback and
- * PageReclaim then it indicates that pages are being queued
- * for IO but are being recycled through the LRU before the
- * IO can complete. Waiting on the page itself risks an
- * indefinite stall if it is impossible to writeback the
- * page due to IO error or disconnected storage so instead
- * note that the LRU is being scanned too quickly and the
- * caller can stall after page list has been processed.
- *
- * 2) Global reclaim encounters a page, memcg encounters a
- * page that is not marked for immediate reclaim or
- * the caller does not have __GFP_IO. In this case mark
- * the page for immediate reclaim and continue scanning.
- *
- * __GFP_IO is checked because a loop driver thread might
- * enter reclaim, and deadlock if it waits on a page for
- * which it is needed to do the write (loop masks off
- * __GFP_IO|__GFP_FS for this reason); but more thought
- * would probably show more reasons.
- *
- * Don't require __GFP_FS, since we're not going into the
- * FS, just waiting on its writeback completion. Worryingly,
- * ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
- * may_enter_fs here is liable to OOM on them.
- *
- * 3) memcg encounters a page that is not already marked
- * PageReclaim. memcg does not have any dirty pages
- * throttling so we could easily OOM just because too many
- * pages are in writeback and there is nothing else to
- * reclaim. Wait for the writeback to complete.
- */
if (PageWriteback(page)) {
- /* Case 1 above */
- if (current_is_kswapd() &&
- PageReclaim(page) &&
- zone_is_reclaim_writeback(zone)) {
- nr_immediate++;
- goto keep_locked;
-
- /* Case 2 above */
- } else if (global_reclaim(sc) ||
+ /*
+ * memcg doesn't have any dirty pages throttling so we
+ * could easily OOM just because too many pages are in
+ * writeback and there is nothing else to reclaim.
+ *
+ * Check __GFP_IO, certainly because a loop driver
+ * thread might enter reclaim, and deadlock if it waits
+ * on a page for which it is needed to do the write
+ * (loop masks off __GFP_IO|__GFP_FS for this reason);
+ * but more thought would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into
+ * the FS, just waiting on its writeback completion.
+ * Worryingly, ext4 gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+ * testing may_enter_fs here is liable to OOM on them.
+ */
+ if (global_reclaim(sc) ||
!PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
/*
* This is slightly racy - end_page_writeback()
@@ -935,13 +755,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
SetPageReclaim(page);
nr_writeback++;
-
goto keep_locked;
-
- /* Case 3 above */
- } else {
- wait_on_page_writeback(page);
}
+ wait_on_page_writeback(page);
}
if (!force_reclaim)
@@ -964,14 +780,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageAnon(page) && !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (!add_to_swap(page, page_list))
+ if (!add_to_swap(page))
goto activate_locked;
may_enter_fs = 1;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
}
+ mapping = page_mapping(page);
+
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
@@ -990,14 +805,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
+ nr_dirty++;
+
/*
* Only kswapd can writeback filesystem pages to
- * avoid risk of stack overflow but only writeback
- * if many dirty pages have been encountered.
+ * avoid risk of stack overflow but do not writeback
+ * unless under significant pressure.
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !zone_is_reclaim_dirty(zone))) {
+ sc->priority >= DEF_PRIORITY - 2)) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -1020,6 +837,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Page is dirty, try to write it out here */
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
@@ -1085,7 +903,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
- if (!mapping || !__remove_mapping(mapping, page, true))
+ if (!mapping || !__remove_mapping(mapping, page))
goto keep_locked;
/*
@@ -1117,26 +935,32 @@ activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && vm_swap_full())
try_to_free_swap(page);
- VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON(PageActive(page));
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
+ VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
- free_hot_cold_page_list(&free_pages, true);
+ /*
+ * Tag a zone as congested if all the dirty pages encountered were
+ * backed by a congested BDI. In this case, reclaimers should just
+ * back off and wait for congestion to clear because further reclaim
+ * will encounter the same problem
+ */
+ if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
+ zone_set_flag(zone, ZONE_CONGESTED);
+
+ free_hot_cold_page_list(&free_pages, 1);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
mem_cgroup_uncharge_end();
*ret_nr_dirty += nr_dirty;
- *ret_nr_congested += nr_congested;
- *ret_nr_unqueued_dirty += nr_unqueued_dirty;
*ret_nr_writeback += nr_writeback;
- *ret_nr_immediate += nr_immediate;
return nr_reclaimed;
}
@@ -1148,23 +972,22 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
+ unsigned long ret, dummy1, dummy2;
struct page *page, *next;
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
- if (page_is_file_cache(page) && !PageDirty(page) &&
- !isolated_balloon_page(page)) {
+ if (page_is_file_cache(page) && !PageDirty(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
}
ret = shrink_page_list(&clean_pages, zone, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS,
- &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, true);
list_splice(&clean_pages, page_list);
- mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1278,7 +1101,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
+ VM_BUG_ON(!PageLRU(page));
switch (__isolate_lru_page(page, mode)) {
case 0:
@@ -1333,7 +1156,7 @@ int isolate_lru_page(struct page *page)
{
int ret = -EBUSY;
- VM_BUG_ON_PAGE(!page_count(page), page);
+ VM_BUG_ON(!page_count(page));
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1404,7 +1227,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
struct page *page = lru_to_page(page_list);
int lru;
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
@@ -1445,19 +1268,6 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
}
/*
- * If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
- * In that case we should only throttle if the backing device it is
- * writing to is congested. In other cases it is safe to throttle.
- */
-static int current_may_throttle(void)
-{
- return !(current->flags & PF_LESS_THROTTLE) ||
- current->backing_dev_info == NULL ||
- bdi_write_congested(current->backing_dev_info);
-}
-
-/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
@@ -1470,10 +1280,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
unsigned long nr_dirty = 0;
- unsigned long nr_congested = 0;
- unsigned long nr_unqueued_dirty = 0;
unsigned long nr_writeback = 0;
- unsigned long nr_immediate = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
@@ -1515,9 +1322,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
- &nr_dirty, &nr_unqueued_dirty, &nr_congested,
- &nr_writeback, &nr_immediate,
- false);
+ &nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
@@ -1538,7 +1343,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&page_list, true);
+ free_hot_cold_page_list(&page_list, 1);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1550,51 +1355,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
- * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
- * of pages under pages flagged for immediate reclaim and stall if any
- * are encountered in the nr_immediate check below.
- */
- if (nr_writeback && nr_writeback == nr_taken)
- zone_set_flag(zone, ZONE_WRITEBACK);
-
- /*
- * memcg will stall in page writeback so only consider forcibly
- * stalling for global reclaim
- */
- if (global_reclaim(sc)) {
- /*
- * Tag a zone as congested if all the dirty pages scanned were
- * backed by a congested BDI and wait_iff_congested will stall.
- */
- if (nr_dirty && nr_dirty == nr_congested)
- zone_set_flag(zone, ZONE_CONGESTED);
-
- /*
- * If dirty pages are scanned that are not queued for IO, it
- * implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context.
- */
- if (nr_unqueued_dirty == nr_taken)
- zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
-
- /*
- * If kswapd scans pages marked marked for immediate
- * reclaim and under writeback (nr_immediate), it implies
- * that pages are cycling through the LRU faster than
- * they are written so also forcibly stall.
- */
- if (nr_immediate && current_may_throttle())
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
- /*
- * Stall direct reclaim for IO completions if underlying BDIs or zone
- * is congested. Allow kswapd to continue until it starts encountering
- * unqueued dirty pages or cycling through the LRU too quickly.
+ * This scales the number of dirty pages that must be under writeback
+ * before throttling depending on priority. It is a simple backoff
+ * function that has the most effect in the range DEF_PRIORITY to
+ * DEF_PRIORITY-2 which is the priority reclaim is considered to be
+ * in trouble and reclaim is considered to be in trouble.
+ *
+ * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
+ * DEF_PRIORITY-1 50% must be PageWriteback
+ * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
+ * ...
+ * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+ * isolated page is PageWriteback
*/
- if (!sc->hibernation_mode && !current_is_kswapd() &&
- current_may_throttle())
+ if (nr_writeback && nr_writeback >=
+ (nr_taken >> (DEF_PRIORITY - sc->priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1637,7 +1412,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
page = lru_to_page(list);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
@@ -1759,7 +1534,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
- free_hot_cold_page_list(&l_hold, true);
+ free_hot_cold_page_list(&l_hold, 1);
}
#ifdef CONFIG_SWAP
@@ -1804,6 +1579,16 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
}
#endif
+static int inactive_file_is_low_global(struct zone *zone)
+{
+ unsigned long active, inactive;
+
+ active = zone_page_state(zone, NR_ACTIVE_FILE);
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+
+ return (active > inactive);
+}
+
/**
* inactive_file_is_low - check if file pages need to be deactivated
* @lruvec: LRU vector to check
@@ -1820,13 +1605,10 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
*/
static int inactive_file_is_low(struct lruvec *lruvec)
{
- unsigned long inactive;
- unsigned long active;
-
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_inactive_file_is_low(lruvec);
- return active > inactive;
+ return inactive_file_is_low_global(lruvec_zone(lruvec));
}
static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1849,12 +1631,12 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
-enum scan_balance {
- SCAN_EQUAL,
- SCAN_FRACT,
- SCAN_ANON,
- SCAN_FILE,
-};
+static int vmscan_swappiness(struct scan_control *sc)
+{
+ if (global_reclaim(sc))
+ return vm_swappiness;
+ return mem_cgroup_swappiness(sc->target_mem_cgroup);
+}
/*
* Determine how aggressively the anon and file LRU lists should be
@@ -1868,18 +1650,15 @@ enum scan_balance {
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- u64 fraction[2];
- u64 denominator = 0; /* gcc */
- struct zone *zone = lruvec_zone(lruvec);
+ unsigned long anon, file, free;
unsigned long anon_prio, file_prio;
- enum scan_balance scan_balance;
- unsigned long anon, file;
- bool force_scan = false;
unsigned long ap, fp;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ u64 fraction[2], denominator;
enum lru_list lru;
- bool some_scanned;
- int pass;
+ int noswap = 0;
+ bool force_scan = false;
+ struct zone *zone = lruvec_zone(lruvec);
/*
* If the zone or memcg is small, nr[l] can be 0. This
@@ -1891,36 +1670,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && !zone_reclaimable(zone))
+ if (current_is_kswapd() && zone->all_unreclaimable)
force_scan = true;
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
- scan_balance = SCAN_FILE;
- goto out;
- }
-
- /*
- * Global reclaim will swap to prevent OOM even with no
- * swappiness, but memcg users want to use this knob to
- * disable swapping for individual groups completely when
- * using the memory controller's swap limit feature would be
- * too expensive.
- */
- if (!global_reclaim(sc) && !sc->swappiness) {
- scan_balance = SCAN_FILE;
- goto out;
- }
-
- /*
- * Do not apply any pressure balancing cleverness when the
- * system is close to OOM, scan both anon and file equally
- * (unless the swappiness setting disagrees with swapping).
- */
- if (!sc->priority && sc->swappiness) {
- scan_balance = SCAN_EQUAL;
+ if (!sc->may_swap || (nr_swap_pages <= 0)) {
+ noswap = 1;
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
goto out;
}
@@ -1929,40 +1689,34 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
get_lru_size(lruvec, LRU_INACTIVE_FILE);
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
if (global_reclaim(sc)) {
- unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
-
+ free = zone_page_state(zone, NR_FREE_PAGES);
if (unlikely(file + free <= high_wmark_pages(zone))) {
- scan_balance = SCAN_ANON;
+ /*
+ * If we have very few page cache pages, force-scan
+ * anon pages.
+ */
+ fraction[0] = 1;
+ fraction[1] = 0;
+ denominator = 1;
+ goto out;
+ } else if (!inactive_file_is_low_global(zone)) {
+ /*
+ * There is enough inactive page cache, do not
+ * reclaim anything from the working set right now.
+ */
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
goto out;
}
}
/*
- * There is enough inactive page cache, do not reclaim
- * anything from the anonymous working set right now.
- */
- if (!inactive_file_is_low(lruvec)) {
- scan_balance = SCAN_FILE;
- goto out;
- }
-
- scan_balance = SCAN_FRACT;
-
- /*
* With swappiness at 100, anonymous and file have the same priority.
* This scanning priority is essentially the inverse of IO cost.
*/
- anon_prio = sc->swappiness;
+ anon_prio = vmscan_swappiness(sc);
file_prio = 200 - anon_prio;
/*
@@ -2003,167 +1757,19 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- some_scanned = false;
- /* Only use force_scan on second pass. */
- for (pass = 0; !some_scanned && pass < 2; pass++) {
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
-
- size = get_lru_size(lruvec, lru);
- scan = size >> sc->priority;
-
- if (!scan && pass && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
-
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
- /*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
- */
- scan = div64_u64(scan * fraction[file],
- denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file)
- scan = 0;
- break;
- default:
- /* Look ma, no brain */
- BUG();
- }
- nr[lru] = scan;
- /*
- * Skip the second pass and don't force_scan,
- * if we found something to scan.
- */
- some_scanned |= !!scan;
- }
- }
-}
-
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-{
- unsigned long nr[NR_LRU_LISTS];
- unsigned long targets[NR_LRU_LISTS];
- unsigned long nr_to_scan;
- enum lru_list lru;
- unsigned long nr_reclaimed = 0;
- unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct blk_plug plug;
- bool scan_adjusted;
-
- get_scan_count(lruvec, sc, nr);
-
- /* Record the original scan target for proportional adjustments later */
- memcpy(targets, nr, sizeof(nr));
-
- /*
- * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
- * event that can occur when there is little memory pressure e.g.
- * multiple streaming readers/writers. Hence, we do not abort scanning
- * when the requested number of pages are reclaimed when scanning at
- * DEF_PRIORITY on the assumption that the fact we are direct
- * reclaiming implies that kswapd is not keeping up and it is best to
- * do a batch of work at once. For memcg reclaim one check is made to
- * abort proportional reclaim if either the file or anon lru has already
- * dropped to zero at the first pass.
- */
- scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
-
- blk_start_plug(&plug);
- while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
- nr[LRU_INACTIVE_FILE]) {
- unsigned long nr_anon, nr_file, percentage;
- unsigned long nr_scanned;
-
- for_each_evictable_lru(lru) {
- if (nr[lru]) {
- nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
- nr[lru] -= nr_to_scan;
-
- nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
- }
- }
-
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
- continue;
-
- /*
- * For kswapd and memcg, reclaim at least the number of pages
- * requested. Ensure that the anon and file LRUs are scanned
- * proportionally what was requested by get_scan_count(). We
- * stop reclaiming one LRU and reduce the amount scanning
- * proportional to the original scan target.
- */
- nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
- nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
-
- /*
- * It's just vindictive to attack the larger once the smaller
- * has gone to zero. And given the way we stop scanning the
- * smaller below, this makes sure that we only make one nudge
- * towards proportionality once we've got nr_to_reclaim.
- */
- if (!nr_file || !nr_anon)
- break;
-
- if (nr_file > nr_anon) {
- unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
- targets[LRU_ACTIVE_ANON] + 1;
- lru = LRU_BASE;
- percentage = nr_anon * 100 / scan_target;
- } else {
- unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
- targets[LRU_ACTIVE_FILE] + 1;
- lru = LRU_FILE;
- percentage = nr_file * 100 / scan_target;
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long scan;
+
+ scan = get_lru_size(lruvec, lru);
+ if (sc->priority || noswap || !vmscan_swappiness(sc)) {
+ scan >>= sc->priority;
+ if (!scan && force_scan)
+ scan = SWAP_CLUSTER_MAX;
+ scan = div64_u64(scan * fraction[file], denominator);
}
-
- /* Stop scanning the smaller of the LRU */
- nr[lru] = 0;
- nr[lru + LRU_ACTIVE] = 0;
-
- /*
- * Recalculate the other LRU scan count based on its original
- * scan target and the percentage scanning already complete
- */
- lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
- nr_scanned = targets[lru] - nr[lru];
- nr[lru] = targets[lru] * (100 - percentage) / 100;
- nr[lru] -= min(nr[lru], nr_scanned);
-
- lru += LRU_ACTIVE;
- nr_scanned = targets[lru] - nr[lru];
- nr[lru] = targets[lru] * (100 - percentage) / 100;
- nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
+ nr[lru] = scan;
}
- blk_finish_plug(&plug);
- sc->nr_reclaimed += nr_reclaimed;
-
- /*
- * Even if we did not try to evict anon pages at all, we want to
- * rebalance the anon lru active/inactive ratio.
- */
- if (inactive_anon_is_low(lruvec))
- shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
- sc, LRU_ACTIVE_ANON);
-
- throttle_vm_writeout(sc->gfp_mask);
}
/* Use reclaim/compaction for costly allocs or under memory pressure */
@@ -2184,7 +1790,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct lruvec *lruvec,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
@@ -2224,15 +1830,15 @@ static inline bool should_continue_reclaim(struct zone *zone,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+ inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ if (nr_swap_pages > 0)
+ inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(zone, sc->order)) {
+ switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
case COMPACT_PARTIAL:
case COMPACT_CONTINUE:
return false;
@@ -2241,54 +1847,98 @@ static inline bool should_continue_reclaim(struct zone *zone,
}
}
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long nr_to_scan;
+ enum lru_list lru;
unsigned long nr_reclaimed, nr_scanned;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
- do {
- struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .zone = zone,
- .priority = sc->priority,
- };
- struct mem_cgroup *memcg;
+restart:
+ nr_reclaimed = 0;
+ nr_scanned = sc->nr_scanned;
+ get_scan_count(lruvec, sc, nr);
- nr_reclaimed = sc->nr_reclaimed;
- nr_scanned = sc->nr_scanned;
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
+ for_each_evictable_lru(lru) {
+ if (nr[lru]) {
+ nr_to_scan = min_t(unsigned long,
+ nr[lru], SWAP_CLUSTER_MAX);
+ nr[lru] -= nr_to_scan;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
- do {
- struct lruvec *lruvec;
+ nr_reclaimed += shrink_list(lru, nr_to_scan,
+ lruvec, sc);
+ }
+ }
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is fine for the starting priority;
+ * we want to put equal scanning pressure on each zone.
+ * However, if the VM has a harder time of freeing pages,
+ * with multiple processes reclaiming pages, the total
+ * freeing target can get unreasonably large.
+ */
+ if (nr_reclaimed >= nr_to_reclaim &&
+ sc->priority < DEF_PRIORITY)
+ break;
+ }
+ blk_finish_plug(&plug);
+ sc->nr_reclaimed += nr_reclaimed;
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ /*
+ * Even if we did not try to evict anon pages at all, we want to
+ * rebalance the anon lru active/inactive ratio.
+ */
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
- sc->swappiness = mem_cgroup_swappiness(memcg);
- shrink_lruvec(lruvec, sc);
+ /* reclaim/compaction might need reclaim to continue */
+ if (should_continue_reclaim(lruvec, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
+ goto restart;
- /*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * zone.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!global_reclaim(sc) &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ throttle_vm_writeout(sc->gfp_mask);
+}
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = sc->priority,
+ };
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ do {
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
+ shrink_lruvec(lruvec, sc);
- } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ /*
+ * Limit reclaim has historically picked one memcg and
+ * scanned it with decreasing priority levels until
+ * nr_to_reclaim had been reclaimed. This priority
+ * cycle is thus over after a single memcg.
+ *
+ * Direct reclaim and kswapd, on the other hand, have
+ * to scan all memory cgroups to fulfill the overall
+ * scan target for the zone.
+ */
+ if (!global_reclaim(sc)) {
+ mem_cgroup_iter_break(root, memcg);
+ break;
+ }
+ memcg = mem_cgroup_iter(root, memcg, &reclaim);
+ } while (memcg);
}
/* Returns true if compaction should go ahead for a high-order request */
@@ -2307,8 +1957,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2353,26 +2004,16 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
- unsigned long lru_pages = 0;
bool aborted_reclaim = false;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- gfp_t orig_mask;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
- enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
- orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
- nodes_clear(shrink.nodes_to_scan);
-
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
@@ -2384,12 +2025,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (global_reclaim(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
- if (sc->priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
+ if (zone->all_unreclaimable &&
+ sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
if (IS_ENABLED(CONFIG_COMPACTION)) {
/*
@@ -2401,8 +2038,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* noticeable problem, like transparent huge
* page allocations.
*/
- if ((zonelist_zone_idx(z) <= requested_highidx)
- && compaction_ready(zone, sc)) {
+ if (compaction_ready(zone, sc)) {
aborted_reclaim = true;
continue;
}
@@ -2425,29 +2061,14 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_zone(zone, sc);
}
- /*
- * Don't shrink slabs when reclaiming memory from over limit cgroups
- * but do shrink slab at least once when aborting reclaim for
- * compaction to avoid unevenly scanning file/anon LRU pages over slab
- * pages.
- */
- if (global_reclaim(sc)) {
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- if (reclaim_state) {
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- }
-
- /*
- * Restore to original mask to avoid the impact on the caller if we
- * promoted it to __GFP_HIGHMEM.
- */
- sc->gfp_mask = orig_mask;
-
return aborted_reclaim;
}
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
@@ -2461,7 +2082,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone))
+ if (!zone->all_unreclaimable)
return false;
}
@@ -2485,9 +2106,13 @@ static bool all_unreclaimable(struct zonelist *zonelist,
* else, the number of pages reclaimed
*/
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
- struct scan_control *sc)
+ struct scan_control *sc,
+ struct shrink_control *shrink)
{
unsigned long total_scanned = 0;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ struct zoneref *z;
+ struct zone *zone;
unsigned long writeback_threshold;
bool aborted_reclaim;
@@ -2497,23 +2122,34 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
count_vm_event(ALLOCSTALL);
do {
- vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
- sc->priority);
sc->nr_scanned = 0;
aborted_reclaim = shrink_zones(zonelist, sc);
+ /*
+ * Don't shrink slabs when reclaiming memory from
+ * over limit cgroups
+ */
+ if (global_reclaim(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
+ shrink_slab(shrink, sc->nr_scanned, lru_pages);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
/*
- * If we're getting trouble reclaiming, start doing
- * writepage even in laptop mode.
- */
- if (sc->priority < DEF_PRIORITY - 2)
- sc->may_writepage = 1;
-
- /*
* Try to write back as many pages as we just scanned. This
* tends to cause slow streaming writers to write data to the
* disk smoothly, at the dirtying rate, which is nice. But
@@ -2526,7 +2162,18 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
- } while (--sc->priority >= 0 && !aborted_reclaim);
+
+ /* Take a nap, wait for some writeback to complete */
+ if (!sc->hibernation_mode && sc->nr_scanned &&
+ sc->priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
+
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ &cpuset_current_mems_allowed,
+ &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+ }
+ } while (--sc->priority >= 0);
out:
delayacct_freepages_end();
@@ -2563,17 +2210,10 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!populated_zone(zone))
- continue;
-
pfmemalloc_reserve += min_wmark_pages(zone);
free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
- /* If there are no reserves (unexpected config) then do not throttle */
- if (!pfmemalloc_reserve)
- return true;
-
wmark_ok = free_pages > pfmemalloc_reserve / 2;
/* kswapd must be awake if processes are being throttled */
@@ -2598,9 +2238,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
- struct zoneref *z;
struct zone *zone;
- pg_data_t *pgdat = NULL;
+ int high_zoneidx = gfp_zone(gfp_mask);
+ pg_data_t *pgdat;
/*
* Kernel threads should not be throttled as they may be indirectly
@@ -2619,34 +2259,10 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (fatal_signal_pending(current))
goto out;
- /*
- * Check if the pfmemalloc reserves are ok by finding the first node
- * with a usable ZONE_NORMAL or lower zone. The expectation is that
- * GFP_KERNEL will be required for allocating network buffers when
- * swapping over the network so ZONE_HIGHMEM is unusable.
- *
- * Throttling is based on the first usable node and throttled processes
- * wait on a queue until kswapd makes progress and wakes them. There
- * is an affinity then between processes waking up and where reclaim
- * progress has been made assuming the process wakes on the same node.
- * More importantly, processes running on remote nodes will not compete
- * for remote pfmemalloc reserves and processes on different nodes
- * should make reasonable progress.
- */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_mask, nodemask) {
- if (zone_idx(zone) > ZONE_NORMAL)
- continue;
-
- /* Throttle based on the first usable node */
- pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
- goto out;
- break;
- }
-
- /* If no zone was usable by the allocation flags then do not throttle */
- if (!pgdat)
+ /* Check if the pfmemalloc reserves are ok */
+ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
goto out;
/* Account for the throttling */
@@ -2684,7 +2300,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
{
unsigned long nr_reclaimed;
struct scan_control sc = {
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_unmap = 1,
@@ -2694,6 +2310,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.target_mem_cgroup = NULL,
.nodemask = nodemask,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
/*
* Do not enter reclaim if fatal signal was delivered while throttled.
@@ -2707,7 +2326,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
sc.may_writepage,
gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
@@ -2729,7 +2348,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_swap = !noswap,
.order = 0,
.priority = 0,
- .swappiness = mem_cgroup_swappiness(memcg),
.target_mem_cgroup = memcg,
};
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@ -2775,6 +2393,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2789,7 +2410,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.may_writepage,
sc.gfp_mask);
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -2852,7 +2473,7 @@ static bool zone_balanced(struct zone *zone, int order,
*/
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long managed_pages = 0;
+ unsigned long present_pages = 0;
unsigned long balanced_pages = 0;
int i;
@@ -2863,7 +2484,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
if (!populated_zone(zone))
continue;
- managed_pages += zone->managed_pages;
+ present_pages += zone->present_pages;
/*
* A special case here:
@@ -2872,19 +2493,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
* DEF_PRIORITY. Effectively, it considers them balanced so
* they must be considered balanced here as well!
*/
- if (!zone_reclaimable(zone)) {
- balanced_pages += zone->managed_pages;
+ if (zone->all_unreclaimable) {
+ balanced_pages += zone->present_pages;
continue;
}
if (zone_balanced(zone, order, 0, i))
- balanced_pages += zone->managed_pages;
+ balanced_pages += zone->present_pages;
else if (!order)
return false;
}
if (order)
- return balanced_pages >= (managed_pages >> 2);
+ return balanced_pages >= (present_pages >> 2);
else
return true;
}
@@ -2920,88 +2541,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
}
/*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
- *
- * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim or if the lack of progress was due to pages under writeback.
- * This is used to determine if the scanning priority needs to be raised.
- */
-static bool kswapd_shrink_zone(struct zone *zone,
- int classzone_idx,
- struct scan_control *sc,
- unsigned long lru_pages,
- unsigned long *nr_attempted)
-{
- int testorder = sc->order;
- unsigned long balance_gap;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- struct shrink_control shrink = {
- .gfp_mask = sc->gfp_mask,
- };
- bool lowmem_pressure;
-
- /* Reclaim above the high watermark. */
- sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
-
- /*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order) !=
- COMPACT_SKIPPED)
- testorder = 0;
-
- /*
- * We put equal pressure on every zone, unless one zone has way too
- * many pages free already. The "too many pages" is defined as the
- * high wmark plus a "gap" where the gap is either the low
- * watermark or 1% of the zone, whichever is smaller.
- */
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
-
- /*
- * If there is no low memory pressure or the zone is balanced then no
- * reclaim is necessary
- */
- lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
- balance_gap, classzone_idx))
- return true;
-
- shrink_zone(zone, sc);
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
-
- reclaim_state->reclaimed_slab = 0;
- shrink_slab(&shrink, sc->nr_scanned, lru_pages);
- sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
- zone_clear_flag(zone, ZONE_WRITEBACK);
-
- /*
- * If a zone reaches its high watermark, consider it to be no longer
- * congested. It's possible there are dirty pages backed by congested
- * BDIs but as pressure is relieved, speculatively avoid congestion
- * waits.
- */
- if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
- zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
- }
-
- return sc->nr_scanned >= sc->nr_to_reclaim;
-}
-
-/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
@@ -3025,28 +2564,40 @@ static bool kswapd_shrink_zone(struct zone *zone,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
+ struct zone *unbalanced_zone;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
+ unsigned long total_scanned;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
.may_swap = 1,
- .may_writepage = !laptop_mode,
+ /*
+ * kswapd doesn't want to be bailed out while reclaim. because
+ * we want to put equal scanning pressure on each zone.
+ */
+ .nr_to_reclaim = ULONG_MAX,
.order = order,
.target_mem_cgroup = NULL,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+loop_again:
+ total_scanned = 0;
+ sc.priority = DEF_PRIORITY;
+ sc.nr_reclaimed = 0;
+ sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
do {
unsigned long lru_pages = 0;
- unsigned long nr_attempted = 0;
- bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
+ int has_under_min_watermark_zone = 0;
- sc.nr_reclaimed = 0;
+ unbalanced_zone = NULL;
/*
* Scan in the highmem->dma direction for the highest
@@ -3058,8 +2609,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (!populated_zone(zone))
continue;
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
/*
@@ -3083,46 +2634,20 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
end_zone = i;
break;
} else {
- /*
- * If balanced, clear the dirty and congested
- * flags
- */
+ /* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
}
}
-
if (i < 0)
goto out;
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- if (!populated_zone(zone))
- continue;
-
lru_pages += zone_reclaimable_pages(zone);
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
}
/*
- * If we're getting trouble reclaiming, start doing writepage
- * even in laptop mode.
- */
- if (sc.priority < DEF_PRIORITY - 2)
- sc.may_writepage = 1;
-
- /*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
@@ -3133,12 +2658,14 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab, testorder;
+ unsigned long balance_gap;
if (!populated_zone(zone))
continue;
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
+ if (zone->all_unreclaimable &&
+ sc.priority != DEF_PRIORITY)
continue;
sc.nr_scanned = 0;
@@ -3151,16 +2678,83 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
+ total_scanned += nr_soft_scanned;
+
+ /*
+ * We put equal pressure on every zone, unless
+ * one zone has way too many pages free
+ * already. The "too many pages" is defined
+ * as the high wmark plus a "gap" where the
+ * gap is either the low watermark or 1%
+ * of the zone, whichever is smaller.
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages +
+ KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ /*
+ * Kswapd reclaims only single pages with compaction
+ * enabled. Trying too hard to reclaim until contiguous
+ * free pages have become available can hurt performance
+ * by evicting too much useful data from memory.
+ * Do not reclaim more than needed for compaction.
+ */
+ testorder = order;
+ if (IS_ENABLED(CONFIG_COMPACTION) && order &&
+ compaction_suitable(zone, order) !=
+ COMPACT_SKIPPED)
+ testorder = 0;
+
+ if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+ !zone_balanced(zone, testorder,
+ balance_gap, end_zone)) {
+ shrink_zone(zone, &sc);
+
+ reclaim_state->reclaimed_slab = 0;
+ nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_scanned += sc.nr_scanned;
+
+ if (nr_slab == 0 && !zone_reclaimable(zone))
+ zone->all_unreclaimable = 1;
+ }
/*
- * There should be no need to raise the scanning
- * priority if enough pages are already being scanned
- * that that high watermark would be met at 100%
- * efficiency.
+ * If we've done a decent amount of scanning and
+ * the reclaim ratio is low, start doing writepage
+ * even in laptop mode
*/
- if (kswapd_shrink_zone(zone, end_zone, &sc,
- lru_pages, &nr_attempted))
- raise_priority = false;
+ if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+ sc.may_writepage = 1;
+
+ if (zone->all_unreclaimable) {
+ if (end_zone && end_zone == i)
+ end_zone--;
+ continue;
+ }
+
+ if (!zone_balanced(zone, testorder, 0, end_zone)) {
+ unbalanced_zone = zone;
+ /*
+ * We are still under min water mark. This
+ * means that we have a GFP_ATOMIC allocation
+ * failure risk. Hurry up!
+ */
+ if (!zone_watermark_ok_safe(zone, order,
+ min_wmark_pages(zone), end_zone, 0))
+ has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * speculatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ }
+
}
/*
@@ -3172,38 +2766,82 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
+ if (pgdat_balanced(pgdat, order, *classzone_idx))
+ break; /* kswapd: all done */
/*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
+ * OK, kswapd is getting into trouble. Take a nap, then take
+ * another pass across the zones.
*/
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
- /* Check if kswapd should be suspending */
- if (try_to_freeze() || kthread_should_stop())
- break;
+ if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
+ if (has_under_min_watermark_zone)
+ count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
+ else if (unbalanced_zone)
+ wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
+ }
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
+ * We do this so kswapd doesn't build up large priorities for
+ * example when it is freeing in parallel with allocators. It
+ * matches the direct reclaim path behaviour in terms of impact
+ * on zone->*_priority.
*/
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
+ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
+ break;
+ } while (--sc.priority >= 0);
+out:
+
+ if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+ cond_resched();
+
+ try_to_freeze();
/*
- * Raise priority if scanning rate is too low or there was no
- * progress in reclaiming pages
+ * Fragmentation may mean that the system cannot be
+ * rebalanced for high-order allocations in all zones.
+ * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
+ * it means the zones have been fully scanned and are still
+ * not balanced. For high-order allocations, there is
+ * little point trying all over again as kswapd may
+ * infinite loop.
+ *
+ * Instead, recheck all watermarks at order-0 as they
+ * are the most important. If watermarks are ok, kswapd will go
+ * back to sleep. High-order users can still perform direct
+ * reclaim if they wish.
*/
- if (raise_priority || !sc.nr_reclaimed)
- sc.priority--;
- } while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
+ order = sc.order = 0;
+
+ goto loop_again;
+ }
+
+ /*
+ * If kswapd was reclaiming at a higher order, it has the option of
+ * sleeping without all zones being balanced. Before it does, it must
+ * ensure that the watermarks for order-0 on *all* zones are met and
+ * that the congestion flags are cleared. The congestion flag must
+ * be cleared as kswapd is the only mechanism that clears the flag
+ * and it is potentially going to sleep here.
+ */
+ if (order) {
+ int zones_need_compaction = 1;
+
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ /* Check if the memory needs to be defragmented. */
+ if (zone_watermark_ok(zone, order,
+ low_wmark_pages(zone), *classzone_idx, 0))
+ zones_need_compaction = 0;
+ }
+
+ if (zones_need_compaction)
+ compact_pgdat(pgdat, order);
+ }
-out:
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
@@ -3371,10 +3009,7 @@ static int kswapd(void *p)
}
}
- tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
-
return 0;
}
@@ -3397,13 +3032,48 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
+/*
+ * The reclaimable count would be mostly accurate.
+ * The less reclaimable pages may be
+ * - mlocked pages, which will be moved to unevictable list when encountered
+ * - mapped pages, which may require several travels to be reclaimed
+ * - dirty pages, which is not "instantly" reclaimable
+ */
+unsigned long global_reclaimable_pages(void)
+{
+ int nr;
+
+ nr = global_page_state(NR_ACTIVE_FILE) +
+ global_page_state(NR_INACTIVE_FILE);
+
+ if (nr_swap_pages > 0)
+ nr += global_page_state(NR_ACTIVE_ANON) +
+ global_page_state(NR_INACTIVE_ANON);
+
+ return nr;
+}
+
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+ int nr;
+
+ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (nr_swap_pages > 0)
+ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+
+ return nr;
+}
+
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3426,6 +3096,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.order = 0,
.priority = DEF_PRIORITY,
};
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
@@ -3435,7 +3108,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -3485,16 +3158,16 @@ int kswapd_run(int nid)
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
+ pgdat->kswapd = NULL;
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
- pgdat->kswapd = NULL;
}
return ret;
}
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
@@ -3607,8 +3280,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
.may_swap = 1,
- .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .nr_to_reclaim = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
};
@@ -3646,9 +3320,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
+ *
+ * Note that shrink_slab will free memory on all zones and may
+ * take a long time.
*/
- nodes_clear(shrink.nodes_to_scan);
- node_set(zone_to_nid(zone), shrink.nodes_to_scan);
for (;;) {
unsigned long lru_pages = zone_reclaimable_pages(zone);
@@ -3697,7 +3372,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
- if (!zone_reclaimable(zone))
+ if (zone->all_unreclaimable)
return ZONE_RECLAIM_FULL;
/*
@@ -3784,7 +3459,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
if (page_evictable(page)) {
enum lru_list lru = page_lru_base_type(page);
- VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON(PageActive(page));
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/vmstat.c b/mm/vmstat.c