aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig482
-rw-r--r--mm/Kconfig.debug29
-rw-r--r--mm/Makefile58
-rw-r--r--mm/allocpercpu.c129
-rw-r--r--mm/backing-dev.c676
-rw-r--r--mm/balloon_compaction.c302
-rw-r--r--mm/bootmem.c1009
-rw-r--r--mm/cleancache.c409
-rw-r--r--mm/compaction.c1291
-rw-r--r--mm/debug-pagealloc.c102
-rw-r--r--mm/dmapool.c505
-rw-r--r--mm/early_ioremap.c245
-rw-r--r--mm/fadvise.c85
-rw-r--r--mm/failslab.c60
-rw-r--r--mm/filemap.c3092
-rw-r--r--mm/filemap.h104
-rw-r--r--mm/filemap_xip.c367
-rw-r--r--mm/fremap.c275
-rw-r--r--mm/frontswap.c455
-rw-r--r--mm/gup.c662
-rw-r--r--mm/highmem.c449
-rw-r--r--mm/huge_memory.c2939
-rw-r--r--mm/hugetlb.c3751
-rw-r--r--mm/hugetlb_cgroup.c409
-rw-r--r--mm/hwpoison-inject.c142
-rw-r--r--mm/init-mm.c25
-rw-r--r--mm/internal.h346
-rw-r--r--mm/interval_tree.c112
-rw-r--r--mm/iov_iter.c743
-rw-r--r--mm/kmemcheck.c122
-rw-r--r--mm/kmemleak-test.c111
-rw-r--r--mm/kmemleak.c1920
-rw-r--r--mm/ksm.c2347
-rw-r--r--mm/list_lru.c152
-rw-r--r--mm/maccess.c62
-rw-r--r--mm/madvise.c329
-rw-r--r--mm/memblock.c1591
-rw-r--r--mm/memcontrol.c7080
-rw-r--r--mm/memory-failure.c1738
-rw-r--r--mm/memory.c3689
-rw-r--r--mm/memory_hotplug.c1899
-rw-r--r--mm/mempolicy.c2524
-rw-r--r--mm/mempool.c151
-rw-r--r--mm/migrate.c1709
-rw-r--r--mm/mincore.c345
-rw-r--r--mm/mlock.c710
-rw-r--r--mm/mm_init.c205
-rw-r--r--mm/mmap.c2614
-rw-r--r--mm/mmu_context.c62
-rw-r--r--mm/mmu_notifier.c332
-rw-r--r--mm/mmzone.c81
-rw-r--r--mm/mprotect.c252
-rw-r--r--mm/mremap.c444
-rw-r--r--mm/msync.c12
-rw-r--r--mm/nobootmem.c434
-rw-r--r--mm/nommu.c1853
-rw-r--r--mm/oom_kill.c842
-rw-r--r--mm/page-writeback.c2409
-rw-r--r--mm/page_alloc.c6303
-rw-r--r--mm/page_cgroup.c529
-rw-r--r--mm/page_io.c345
-rw-r--r--mm/page_isolation.c273
-rw-r--r--mm/pagewalk.c248
-rw-r--r--mm/pdflush.c239
-rw-r--r--mm/percpu-km.c108
-rw-r--r--mm/percpu-vm.c448
-rw-r--r--mm/percpu.c1968
-rw-r--r--mm/pgtable-generic.c202
-rw-r--r--mm/prio_tree.c207
-rw-r--r--mm/process_vm_access.c379
-rw-r--r--mm/quicklist.c102
-rw-r--r--mm/readahead.c724
-rw-r--r--mm/rmap.c1685
-rw-r--r--mm/shmem.c3428
-rw-r--r--mm/slab.c3970
-rw-r--r--mm/slab.h298
-rw-r--r--mm/slab_common.c789
-rw-r--r--mm/slob.c726
-rw-r--r--mm/slub.c5374
-rw-r--r--mm/sparse-vmemmap.c235
-rw-r--r--mm/sparse.c657
-rw-r--r--mm/swap.c1086
-rw-r--r--mm/swap_state.c440
-rw-r--r--mm/swapfile.c2737
-rw-r--r--mm/thrash.c106
-rw-r--r--mm/tiny-shmem.c148
-rw-r--r--mm/truncate.c656
-rw-r--r--mm/util.c490
-rw-r--r--mm/vmacache.c132
-rw-r--r--mm/vmalloc.c2493
-rw-r--r--mm/vmpressure.c380
-rw-r--r--mm/vmscan.c3722
-rw-r--r--mm/vmstat.c1291
-rw-r--r--mm/workingset.c414
-rw-r--r--mm/zbud.c527
-rw-r--r--mm/zsmalloc.c1117
-rw-r--r--mm/zswap.c940
97 files changed, 84135 insertions, 17053 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f7..3e9977a9d65 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,6 +1,6 @@
config SELECT_MEMORY_MODEL
def_bool y
- depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL
+ depends on ARCH_SELECT_MEMORY_MODEL
choice
prompt "Memory model"
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL
Some users of more advanced features like NUMA and
memory hotplug may have different options here.
- DISCONTIGMEM is an more mature, better tested system,
+ DISCONTIGMEM is a more mature, better tested system,
but is incompatible with memory hotplug and may suffer
decreased performance over SPARSEMEM. If unsure between
"Sparse Memory" and "Discontiguous Memory", choose
@@ -37,7 +37,7 @@ config DISCONTIGMEM_MANUAL
in their physical address spaces, and this option provides
more efficient handling of these holes. However, the vast
majority of hardware has quite flat address spaces, and
- can have degraded performance from extra overhead that
+ can have degraded performance from the extra overhead that
this option imposes.
Many NUMA configurations will have this as the only option.
@@ -67,7 +67,7 @@ config DISCONTIGMEM
config SPARSEMEM
def_bool y
- depends on SPARSEMEM_MANUAL
+ depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
config FLATMEM
def_bool y
@@ -92,7 +92,7 @@ config HAVE_MEMORY_PRESENT
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called. If this can not
+# allocations when memory_present() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
@@ -101,10 +101,10 @@ config HAVE_MEMORY_PRESENT
# with gcc 3.4 and later.
#
config SPARSEMEM_STATIC
- def_bool n
+ bool
#
-# Architectecture platforms which require a two level mem_section in SPARSEMEM
+# Architecture platforms which require a two level mem_section in SPARSEMEM
# must select this option. This is usually for architecture platforms with
# an extremely sparse physical address space.
#
@@ -112,14 +112,102 @@ config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+config SPARSEMEM_VMEMMAP_ENABLE
+ bool
+
+config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ def_bool y
+ depends on SPARSEMEM && X86_64
+
+config SPARSEMEM_VMEMMAP
+ bool "Sparse Memory virtual memmap"
+ depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
+ default y
+ help
+ SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise
+ pfn_to_page and page_to_pfn operations. This is the most
+ efficient option when sufficient kernel resources are available.
+
+config HAVE_MEMBLOCK
+ boolean
+
+config HAVE_MEMBLOCK_NODE_MAP
+ boolean
+
+config HAVE_MEMBLOCK_PHYS_MAP
+ boolean
+
+config ARCH_DISCARD_MEMBLOCK
+ boolean
+
+config NO_BOOTMEM
+ boolean
+
+config MEMORY_ISOLATION
+ boolean
+
+config MOVABLE_NODE
+ boolean "Enable to assign a node which has only movable memory"
+ depends on HAVE_MEMBLOCK
+ depends on NO_BOOTMEM
+ depends on X86_64
+ depends on NUMA
+ default n
+ help
+ Allow a node to have only movable memory. Pages used by the kernel,
+ such as direct mapping pages cannot be migrated. So the corresponding
+ memory device cannot be hotplugged. This option allows the following
+ two things:
+ - When the system is booting, node full of hotpluggable memory can
+ be arranged to have only movable memory so that the whole node can
+ be hot-removed. (need movable_node boot option specified).
+ - After the system is up, the option allows users to online all the
+ memory of a node as movable memory so that the whole node can be
+ hot-removed.
+
+ Users who don't use the memory hotplug feature are fine with this
+ option on since they don't specify movable_node boot option or they
+ don't online memory as movable.
+
+ Say Y here if you want to hotplug a whole node.
+ Say N here if you want kernel to use memory on all nodes evenly.
+
+#
+# Only set this on architectures that have completely implemented memory hotplug
+# feature. If you are not sure, don't touch it.
+#
+config HAVE_BOOTMEM_INFO_NODE
+ def_bool n
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
- depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC64)
+ depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
-comment "Memory hotplug is currently incompatible with Software Suspend"
- depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+config MEMORY_HOTPLUG_SPARSE
+ def_bool y
+ depends on SPARSEMEM && MEMORY_HOTPLUG
+
+config MEMORY_HOTREMOVE
+ bool "Allow for memory hot remove"
+ select MEMORY_ISOLATION
+ select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
+ depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on MIGRATION
+
+#
+# If we have space for more page flags then we can enable additional
+# optimizations and functionality.
+#
+# Regular Sparsemem takes page flag bits for the sectionid if it does not
+# use a virtual memmap. Disable extended page flags for 32 bit platforms
+# that require the use of a sectionid in the page flags.
+#
+config PAGEFLAGS_EXTENDED
+ def_bool y
+ depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
@@ -127,28 +215,380 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
config SPLIT_PTLOCK_CPUS
int
- default "4096" if ARM && !CPU_CACHE_VIPT
- default "4096" if PARISC && !PA20
+ default "999999" if !MMU
+ default "999999" if ARM && !CPU_CACHE_VIPT
+ default "999999" if PARISC && !PA20
default "4"
+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ boolean
+
+#
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+ bool "Allow for balloon memory compaction/migration"
+ def_bool y
+ depends on COMPACTION && VIRTIO_BALLOON
+ help
+ Memory fragmentation introduced by ballooning might reduce
+ significantly the number of 2MB contiguous memory blocks that can be
+ used within a guest, thus imposing performance penalties associated
+ with the reduced number of transparent huge pages that could be used
+ by the guest workload. Allowing the compaction & migration for memory
+ pages enlisted as being part of memory balloon devices avoids the
+ scenario aforementioned and helps improve memory defragmentation.
+
+#
+# support for memory compaction
+config COMPACTION
+ bool "Allow for memory compaction"
+ def_bool y
+ select MIGRATION
+ depends on MMU
+ help
+ Allows the compaction of memory for the allocation of huge pages.
+
#
# support for page migration
#
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA
+ depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
help
Allows the migration of the physical location of pages of processes
- while the virtual addresses are not changed. This is useful for
- example on NUMA systems to put pages nearer to the processors accessing
- the page.
+ while the virtual addresses are not changed. This is useful in
+ two situations. The first is on NUMA systems to put pages nearer
+ to the processors accessing them. The second is when allocating huge
+ pages as migration can relocate pages to satisfy a huge page
+ allocation instead of reclaiming.
+
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ boolean
+
+config PHYS_ADDR_T_64BIT
+ def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
+config ZONE_DMA_FLAG
+ int
+ default "0" if !ZONE_DMA
+ default "1"
+
+config BOUNCE
+ bool "Enable bounce buffers"
+ default y
+ depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM)
+ help
+ Enable bounce buffers for devices that cannot access
+ the full range of memory available to the CPU. Enabled
+ by default when ZONE_DMA or HIGHMEM is selected, but you
+ may say n to override this.
+
+# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often
+# have more than 4GB of memory, but we don't currently use the IOTLB to present
+# a 32-bit address to OHCI. So we need to use a bounce pool instead.
+#
+# We also use the bounce pool to provide stable page writes for jbd. jbd
+# initiates buffer writeback without locking the page or setting PG_writeback,
+# and fixing that behavior (a second time; jbd2 doesn't have this problem) is
+# a major rework effort. Instead, use the bounce buffer to snapshot pages
+# (until jbd goes away). The only jbd user is ext3.
+config NEED_BOUNCE_POOL
+ bool
+ default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD)
+
+config NR_QUICK
+ int
+ depends on QUICKLIST
+ default "2" if AVR32
+ default "1"
+
+config VIRT_TO_BUS
+ bool
+ help
+ An architecture should select this if it implements the
+ deprecated interface virt_to_bus(). All new architectures
+ should probably not select this.
+
+
+config MMU_NOTIFIER
+ bool
+
+config KSM
+ bool "Enable KSM for page merging"
+ depends on MMU
+ help
+ Enable Kernel Samepage Merging: KSM periodically scans those areas
+ of an application's address space that an app has advised may be
+ mergeable. When it finds pages of identical content, it replaces
+ the many instances by a single page with that content, so
+ saving memory until one or another app needs to modify the content.
+ Recommended for use with KVM, or with other duplicative applications.
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
-config RESOURCES_64BIT
- bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
- default 64BIT
+config DEFAULT_MMAP_MIN_ADDR
+ int "Low address space to protect from user allocation"
+ depends on MMU
+ default 4096
+ help
+ This is the portion of low virtual memory which should be protected
+ from userspace allocation. Keeping a user from writing to low pages
+ can help reduce the impact of kernel NULL pointer bugs.
+
+ For most ia64, ppc64 and x86 users with lots of address space
+ a value of 65536 is reasonable and should cause no problems.
+ On arm and other archs it should not be higher than 32768.
+ Programs which use vm86 functionality or have some need to map
+ this low address space will need CAP_SYS_RAWIO or disable this
+ protection by setting the value to 0.
+
+ This value can be changed after boot using the
+ /proc/sys/vm/mmap_min_addr tunable.
+
+config ARCH_SUPPORTS_MEMORY_FAILURE
+ bool
+
+config MEMORY_FAILURE
+ depends on MMU
+ depends on ARCH_SUPPORTS_MEMORY_FAILURE
+ bool "Enable recovery from hardware memory errors"
+ select MEMORY_ISOLATION
+ help
+ Enables code to recover from some memory failures on systems
+ with MCA recovery. This allows a system to continue running
+ even when some of its memory has uncorrected errors. This requires
+ special hardware support and typically ECC memory.
+
+config HWPOISON_INJECT
+ tristate "HWPoison pages injector"
+ depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
+ select PROC_PAGE_MONITOR
+
+config NOMMU_INITIAL_TRIM_EXCESS
+ int "Turn on mmap() excess space trimming before booting"
+ depends on !MMU
+ default 1
+ help
+ The NOMMU mmap() frequently needs to allocate large contiguous chunks
+ of memory on which to store mappings, but it can only ask the system
+ allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+ more than it requires. To deal with this, mmap() is able to trim off
+ the excess and return it to the allocator.
+
+ If trimming is enabled, the excess is trimmed off and returned to the
+ system allocator, which can cause extra fragmentation, particularly
+ if there are a lot of transient processes.
+
+ If trimming is disabled, the excess is kept, but not used, which for
+ long-term mappings means that the space is wasted.
+
+ Trimming can be dynamically controlled through a sysctl option
+ (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+ excess pages there must be before trimming should occur, or zero if
+ no trimming is to occur.
+
+ This option specifies the initial value of this option. The default
+ of 1 says that all excess pages should be trimmed.
+
+ See Documentation/nommu-mmap.txt for more information.
+
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory constrained on embedded, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always, can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise, will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk to increase the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
+
+config CLEANCACHE
+ bool "Enable cleancache driver to cache clean pages if tmem is present"
+ default n
+ help
+ Cleancache can be thought of as a page-granularity victim cache
+ for clean pages that the kernel's pageframe replacement algorithm
+ (PFRA) would like to keep around, but can't since there isn't enough
+ memory. So when the PFRA "evicts" a page, it first attempts to use
+ cleancache code to put the data contained in that page into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. And when a cleancache-enabled
+ filesystem wishes to access a page in a file on disk, it first
+ checks cleancache to see if it already contains it; if it does,
+ the page is copied into the kernel and a disk access is avoided.
+ When a transcendent memory driver is available (such as zcache or
+ Xen transcendent memory), a significant I/O reduction
+ may be achieved. When none is available, all cleancache calls
+ are reduced to a single pointer-compare-against-NULL resulting
+ in a negligible performance hit.
+
+ If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
+
+config CMA
+ bool "Contiguous Memory Allocator"
+ depends on HAVE_MEMBLOCK && MMU
+ select MIGRATION
+ select MEMORY_ISOLATION
+ help
+ This enables the Contiguous Memory Allocator which allows other
+ subsystems to allocate big physically-contiguous blocks of memory.
+ CMA reserves a region of memory and allows only movable pages to
+ be allocated from it. This way, the kernel can use the memory for
+ pagecache and when a subsystem requests for contiguous area, the
+ allocated pages are migrated away to serve the contiguous request.
+
+ If unsure, say "n".
+
+config CMA_DEBUG
+ bool "CMA debug messages (DEVELOPMENT)"
+ depends on DEBUG_KERNEL && CMA
+ help
+ Turns on debug messages in CMA. This produces KERN_DEBUG
+ messages for every CMA call as well as various messages while
+ processing calls such as dma_alloc_from_contiguous().
+ This option does not affect warning and error messages.
+
+config ZBUD
+ tristate
+ default n
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to two compressed pages per physical
+ page. While this design limits storage density, it has simple and
+ deterministic reclaim properties that make it preferable to a higher
+ density approach when reclaim will be used.
+
+config ZSWAP
+ bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ depends on FRONTSWAP && CRYPTO=y
+ select CRYPTO_LZO
+ select ZBUD
+ default n
help
- This option allows memory and IO resources to be 64 bit.
+ A lightweight compressed cache for swap pages. It takes
+ pages that are in the process of being swapped out and attempts to
+ compress them into a dynamically allocated RAM-based memory pool.
+ This can result in a significant I/O reduction on swap device and,
+ in the case where decompressing from RAM is faster than swap device
+ reads, can also improve workload performance.
+
+ This is marked experimental because it is a new feature (as of
+ v3.11) that interacts heavily with memory reclaim. While these
+ interactions don't cause any known issues on simple memory setups,
+ they have not been fully explored on the large set of potential
+ configurations and workloads that exist.
+
+config MEM_SOFT_DIRTY
+ bool "Track memory changes"
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
+ select PROC_PAGE_MONITOR
+ help
+ This option enables memory changes tracking by introducing a
+ soft-dirty bit on pte-s. This bit is set when someone writes
+ into a page just as the regular dirty bit, but unlike the latter
+ it can be cleared by hand.
+
+ See Documentation/vm/soft-dirty.txt for more details.
+
+config ZSMALLOC
+ tristate "Memory allocator for compressed pages"
+ depends on MMU
+ default n
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ compressed RAM pages. zsmalloc uses virtual memory mapping
+ in order to reduce fragmentation. However, this results in a
+ non-standard allocator interface where a handle, not a pointer, is
+ returned by an alloc(). This handle must be mapped in order to
+ access the allocated space.
+
+config PGTABLE_MAPPING
+ bool "Use page table mapping to access object in zsmalloc"
+ depends on ZSMALLOC
+ help
+ By default, zsmalloc uses a copy-based object mapping method to
+ access allocations that span two pages. However, if a particular
+ architecture (e.g. ARM) performs VM mapping faster than copying,
+ then you should select this. This causes zsmalloc to use page table
+ mapping rather than copying for object mapping.
+
+ You can check speed with zsmalloc benchmark:
+ https://github.com/spartacus06/zsmapbench
+
+config GENERIC_EARLY_IOREMAP
+ bool
+
+config MAX_STACK_SIZE_MB
+ int "Maximum user stack size for 32-bit processes (MB)"
+ default 80
+ range 8 256 if METAG
+ range 8 2048
+ depends on STACK_GROWSUP && (!64BIT || COMPAT)
+ help
+ This is the maximum stack size in Megabytes in the VM layout of 32-bit
+ user processes when the stack grows upwards (currently only on parisc
+ and metag arch). The stack will be located at the highest memory
+ address minus the given value, unless the RLIMIT_STACK hard limit is
+ changed to a smaller value in which case that is used.
+
+ A sane initial value is 80 MB.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 00000000000..4b2443254de
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,29 @@
+config DEBUG_PAGEALLOC
+ bool "Debug page memory allocations"
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
+ depends on !KMEMCHECK
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ ---help---
+ Unmap pages from the kernel linear mapping after free_pages().
+ This results in a large slowdown, but helps to find certain types
+ of memory corruption.
+
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally,
+ this option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
+
+config WANT_PAGE_DEBUG_FLAGS
+ bool
+
+config PAGE_POISONING
+ bool
+ select WANT_PAGE_DEBUG_FLAGS
+
+config PAGE_GUARD
+ bool
+ select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/Makefile b/mm/Makefile
index 60c56c0b5e1..4064f3ec145 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,24 +3,62 @@
#
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o pagewalk.o pgtable-generic.o
-obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- page_alloc.o page-writeback.o pdflush.o \
- readahead.o swap.o truncate.o vmscan.o \
- prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+ifdef CONFIG_CROSS_MEMORY_ATTACH
+mmu-$(CONFIG_MMU) += process_vm_access.o
+endif
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+ maccess.o page_alloc.o page-writeback.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
+ mm_init.o mmu_context.o percpu.o slab_common.o \
+ compaction.o balloon_compaction.o vmacache.o \
+ interval_tree.o list_lru.o workingset.o \
+ iov_iter.o $(mmu-y)
+
+obj-y += init-mm.o
+
+ifdef CONFIG_NO_BOOTMEM
+ obj-y += nobootmem.o
+else
+ obj-y += bootmem.o
+endif
+
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
+obj-$(CONFIG_ZSWAP) += zswap.o
+obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
-obj-$(CONFIG_SHMEM) += shmem.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
+obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
+obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_SMP) += allocpercpu.o
+obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
+obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD) += zbud.o
+obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index eaa9abeea53..00000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * linux/mm/allocpercpu.c
- *
- * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com>
- */
-#include <linux/mm.h>
-#include <linux/module.h>
-
-/**
- * percpu_depopulate - depopulate per-cpu data for given cpu
- * @__pdata: per-cpu data to depopulate
- * @cpu: depopulate per-cpu data for this cpu
- *
- * Depopulating per-cpu data for a cpu going offline would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- */
-void percpu_depopulate(void *__pdata, int cpu)
-{
- struct percpu_data *pdata = __percpu_disguise(__pdata);
- if (pdata->ptrs[cpu]) {
- kfree(pdata->ptrs[cpu]);
- pdata->ptrs[cpu] = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(percpu_depopulate);
-
-/**
- * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
- * @__pdata: per-cpu data to depopulate
- * @mask: depopulate per-cpu data for cpu's selected through mask bits
- */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
-{
- int cpu;
- for_each_cpu_mask(cpu, *mask)
- percpu_depopulate(__pdata, cpu);
-}
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
-
-/**
- * percpu_populate - populate per-cpu data for given cpu
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @cpu: populate per-data for this cpu
- *
- * Populating per-cpu data for a cpu coming online would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- * Per-cpu object is populated with zeroed buffer.
- */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
-{
- struct percpu_data *pdata = __percpu_disguise(__pdata);
- int node = cpu_to_node(cpu);
-
- BUG_ON(pdata->ptrs[cpu]);
- if (node_online(node)) {
- /* FIXME: kzalloc_node(size, gfp, node) */
- pdata->ptrs[cpu] = kmalloc_node(size, gfp, node);
- if (pdata->ptrs[cpu])
- memset(pdata->ptrs[cpu], 0, size);
- } else
- pdata->ptrs[cpu] = kzalloc(size, gfp);
- return pdata->ptrs[cpu];
-}
-EXPORT_SYMBOL_GPL(percpu_populate);
-
-/**
- * percpu_populate_mask - populate per-cpu data for more cpu's
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-cpu data for cpu's selected through mask bits
- *
- * Per-cpu objects are populated with zeroed buffers.
- */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
- cpumask_t *mask)
-{
- cpumask_t populated = CPU_MASK_NONE;
- int cpu;
-
- for_each_cpu_mask(cpu, *mask)
- if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
- __percpu_depopulate_mask(__pdata, &populated);
- return -ENOMEM;
- } else
- cpu_set(cpu, populated);
- return 0;
-}
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
-
-/**
- * percpu_alloc_mask - initial setup of per-cpu data
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
- *
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
- */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
-{
- void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
- void *__pdata = __percpu_disguise(pdata);
-
- if (unlikely(!pdata))
- return NULL;
- if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
- return __pdata;
- kfree(pdata);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
-
-/**
- * percpu_free - final cleanup of per-cpu data
- * @__pdata: object to clean up
- *
- * We simply clean up any per-cpu object left. No need for the client to
- * track and specify through a bis mask which per-cpu objects are to free.
- */
-void percpu_free(void *__pdata)
-{
- __percpu_depopulate_mask(__pdata, &cpu_possible_map);
- kfree(__percpu_disguise(__pdata));
-}
-EXPORT_SYMBOL_GPL(percpu_free);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 00000000000..1706cbbdf5f
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,676 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+#include <trace/events/writeback.h>
+
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+struct backing_dev_info default_backing_dev_info = {
+ .name = "default",
+ .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
+ .state = 0,
+ .capabilities = BDI_CAP_MAP_COPY,
+};
+EXPORT_SYMBOL_GPL(default_backing_dev_info);
+
+struct backing_dev_info noop_backing_dev_info = {
+ .name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
+
+static struct class *bdi_class;
+
+/*
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
+ * locking.
+ */
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
+
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static struct dentry *bdi_debug_root;
+
+static void bdi_debug_init(void)
+{
+ bdi_debug_root = debugfs_create_dir("bdi", NULL);
+}
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ struct bdi_writeback *wb = &bdi->wb;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
+ unsigned long nr_dirty, nr_io, nr_more_io;
+ struct inode *inode;
+
+ nr_dirty = nr_io = nr_more_io = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+ nr_more_io++;
+ spin_unlock(&wb->list_lock);
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+ seq_printf(m,
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiDirtied: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
+ (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
+#undef K
+
+ return 0;
+}
+
+static int bdi_debug_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, bdi_debug_stats_show, inode->i_private);
+}
+
+static const struct file_operations bdi_debug_stats_fops = {
+ .open = bdi_debug_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+{
+ bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
+ bdi, &bdi_debug_stats_fops);
+}
+
+static void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+ debugfs_remove(bdi->debug_stats);
+ debugfs_remove(bdi->debug_dir);
+}
+#else
+static inline void bdi_debug_init(void)
+{
+}
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
+ const char *name)
+{
+}
+static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
+{
+}
+#endif
+
+static ssize_t read_ahead_kb_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned long read_ahead_kb;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
+}
+
+#define K(pages) ((pages) << (PAGE_SHIFT - 10))
+
+#define BDI_SHOW(name, expr) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *page) \
+{ \
+ struct backing_dev_info *bdi = dev_get_drvdata(dev); \
+ \
+ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+} \
+static DEVICE_ATTR_RW(name);
+
+BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
+
+static ssize_t min_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(min_ratio, bdi->min_ratio)
+
+static ssize_t max_ratio_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+ unsigned int ratio;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
+
+ return ret;
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
+
+static ssize_t stable_pages_required_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n",
+ bdi_cap_stable_pages_required(bdi) ? 1 : 0);
+}
+static DEVICE_ATTR_RO(stable_pages_required);
+
+static struct attribute *bdi_dev_attrs[] = {
+ &dev_attr_read_ahead_kb.attr,
+ &dev_attr_min_ratio.attr,
+ &dev_attr_max_ratio.attr,
+ &dev_attr_stable_pages_required.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(bdi_dev);
+
+static __init int bdi_class_init(void)
+{
+ bdi_class = class_create(THIS_MODULE, "bdi");
+ if (IS_ERR(bdi_class))
+ return PTR_ERR(bdi_class);
+
+ bdi_class->dev_groups = bdi_dev_groups;
+ bdi_debug_init();
+ return 0;
+}
+postcore_initcall(bdi_class_init);
+
+static int __init default_bdi_init(void)
+{
+ int err;
+
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!bdi_wq)
+ return -ENOMEM;
+
+ err = bdi_init(&default_backing_dev_info);
+ if (!err)
+ bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
+
+ return err;
+}
+subsys_initcall(default_bdi_init);
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+ return wb_has_dirty_io(&bdi->wb);
+}
+
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * start only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save a few context
+ * switches by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ spin_lock_bh(&bdi->wb_lock);
+ if (test_bit(BDI_registered, &bdi->state))
+ queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ synchronize_rcu_expedited();
+}
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct device *dev;
+
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
+
+ va_start(args, fmt);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ va_end(args);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ bdi->dev = dev;
+
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(BDI_registered, &bdi->state);
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
+}
+EXPORT_SYMBOL(bdi_register);
+
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+{
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
+
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+ if (!bdi_cap_writeback_dirty(bdi))
+ return;
+
+ /*
+ * Make sure nobody finds us on the bdi_list anymore
+ */
+ bdi_remove_from_list(bdi);
+
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ clear_bit(BDI_registered, &bdi->state);
+ spin_unlock_bh(&bdi->wb_lock);
+
+ /*
+ * Drain work list and shut down the delayed_work. At this point,
+ * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi
+ * is dying and its work_list needs to be drained no matter what.
+ */
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ flush_delayed_work(&bdi->wb.dwork);
+ WARN_ON(!list_empty(&bdi->work_list));
+
+ /*
+ * This shouldn't be necessary unless @bdi for some reason has
+ * unflushed dirty IO after work_list is drained. Do it anyway
+ * just in case.
+ */
+ cancel_delayed_work_sync(&bdi->wb.dwork);
+}
+
+/*
+ * This bdi is going away now, make sure that no super_blocks point to it
+ */
+static void bdi_prune_sb(struct backing_dev_info *bdi)
+{
+ struct super_block *sb;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_bdi == bdi)
+ sb->s_bdi = &default_backing_dev_info;
+ }
+ spin_unlock(&sb_lock);
+}
+
+void bdi_unregister(struct backing_dev_info *bdi)
+{
+ struct device *dev = bdi->dev;
+
+ if (dev) {
+ bdi_set_min_ratio(bdi, 0);
+ trace_writeback_bdi_unregister(bdi);
+ bdi_prune_sb(bdi);
+
+ bdi_wb_shutdown(bdi);
+ bdi_debug_unregister(bdi);
+
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->dev = NULL;
+ spin_unlock_bh(&bdi->wb_lock);
+
+ device_unregister(dev);
+ }
+}
+EXPORT_SYMBOL(bdi_unregister);
+
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
+ INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+}
+
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
+int bdi_init(struct backing_dev_info *bdi)
+{
+ int i, err;
+
+ bdi->dev = NULL;
+
+ bdi->min_ratio = 0;
+ bdi->max_ratio = 100;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
+ spin_lock_init(&bdi->wb_lock);
+ INIT_LIST_HEAD(&bdi->bdi_list);
+ INIT_LIST_HEAD(&bdi->work_list);
+
+ bdi_wb_init(&bdi->wb, bdi);
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ if (err)
+ goto err;
+ }
+
+ bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->balanced_dirty_ratelimit = INIT_BW;
+ bdi->dirty_ratelimit = INIT_BW;
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
+ err = fprop_local_init_percpu(&bdi->completions);
+
+ if (err) {
+err:
+ while (i--)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ int i;
+
+ /*
+ * Splice our entries to the default_backing_dev_info, if this
+ * bdi disappears
+ */
+ if (bdi_has_dirty_io(bdi)) {
+ struct bdi_writeback *dst = &default_backing_dev_info.wb;
+
+ bdi_lock_two(&bdi->wb, dst);
+ list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
+ list_splice(&bdi->wb.b_io, &dst->b_io);
+ list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
+ }
+
+ bdi_unregister(bdi);
+
+ /*
+ * If bdi_unregister() had already been called earlier, the dwork
+ * could still be pending because bdi_prune_sb() can race with the
+ * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
+ */
+ cancel_delayed_work_sync(&bdi->wb.dwork);
+
+ for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+ percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+ fprop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+ unsigned int cap)
+{
+ int err;
+
+ bdi->name = name;
+ bdi->capabilities = cap;
+ err = bdi_init(bdi);
+ if (err)
+ return err;
+
+ err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+ atomic_long_inc_return(&bdi_seq));
+ if (err) {
+ bdi_destroy(bdi);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
+
+static wait_queue_head_t congestion_wqh[2] = {
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+ __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+ };
+static atomic_t nr_bdi_congested[2];
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ enum bdi_state bit;
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
+ smp_mb__after_atomic();
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+ enum bdi_state bit;
+
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion. If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char kbuf[] = "0\n";
+
+ if (*ppos || *lenp < sizeof(kbuf)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+ return -EFAULT;
+ printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+ table->procname);
+
+ *lenp = 2;
+ *ppos += *lenp;
+ return 2;
+}
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 00000000000..6e45a5074bf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+/*
+ * balloon_devinfo_alloc - allocates a balloon device information descriptor.
+ * @balloon_dev_descriptor: pointer to reference the balloon device which
+ * this struct balloon_dev_info will be servicing.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct balloon_dev_info which will be used to reference a balloon device
+ * as well as to keep track of the balloon device page list.
+ */
+struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+{
+ struct balloon_dev_info *b_dev_info;
+ b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
+ if (!b_dev_info)
+ return ERR_PTR(-ENOMEM);
+
+ b_dev_info->balloon_device = balloon_dev_descriptor;
+ b_dev_info->mapping = NULL;
+ b_dev_info->isolated_pages = 0;
+ spin_lock_init(&b_dev_info->pages_lock);
+ INIT_LIST_HEAD(&b_dev_info->pages);
+
+ return b_dev_info;
+}
+EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+
+/*
+ * balloon_page_enqueue - allocates a new page and inserts it into the balloon
+ * page list.
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
+ *
+ * Driver must call it to properly allocate a new enlisted balloon page
+ * before definitively removing it from the guest system.
+ * This function returns the page address for the recently enqueued page or
+ * NULL in the case we fail to allocate a new page this turn.
+ */
+struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+{
+ unsigned long flags;
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
+ if (!page)
+ return NULL;
+
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point.
+ */
+ BUG_ON(!trylock_page(page));
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - removes a page from balloon's page list and returns
+ * its address to allow the driver to release the page.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
+ *
+ * Driver must call it to properly de-allocate a previously enlisted balloon
+ * page before definitively releasing it back to the guest system.
+ * This function returns the page address for the recently dequeued page or
+ * NULL in the case we find balloon's page list temporarily empty due to
+ * compaction isolated pages.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ bool dequeued_page;
+
+ dequeued_page = false;
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ /*
+ * Block others from accessing the 'page' while we get around
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (trylock_page(page)) {
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ /*
+ * Raise the page refcount here to prevent any wrong
+ * attempt to isolate this page, in case of colliding
+ * with balloon_page_isolate() just after we release
+ * the page lock.
+ *
+ * balloon_page_free() will take care of dropping
+ * this extra refcount later.
+ */
+ get_page(page);
+ balloon_page_delete(page);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ dequeued_page = true;
+ break;
+ }
+ }
+
+ if (!dequeued_page) {
+ /*
+ * If we are unable to dequeue a balloon page because the page
+ * list is empty and there are no isolated pages, then something
+ * went out of track and some balloon pages are lost.
+ * BUG() here, otherwise the balloon driver may get stuck in
+ * an infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ if (unlikely(list_empty(&b_dev_info->pages) &&
+ !b_dev_info->isolated_pages))
+ BUG();
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
+ * @b_dev_info: holds the balloon device information descriptor.
+ * @a_ops: balloon_mapping address_space_operations descriptor.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct address_space which will be used as the special page->mapping for
+ * balloon device enlisted page instances.
+ */
+struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+ const struct address_space_operations *a_ops)
+{
+ struct address_space *mapping;
+
+ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Give a clean 'zeroed' status to all elements of this special
+ * balloon page->mapping struct address_space instance.
+ */
+ address_space_init_once(mapping);
+
+ /*
+ * Set mapping->flags appropriately, to allow balloon pages
+ * ->mapping identification.
+ */
+ mapping_set_balloon(mapping);
+ mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
+
+ /* balloon's page->mapping->a_ops callback descriptor */
+ mapping->a_ops = a_ops;
+
+ /*
+ * Establish a pointer reference back to the balloon device descriptor
+ * this particular page->mapping will be servicing.
+ * This is used by compaction / migration procedures to identify and
+ * access the balloon device pageset while isolating / migrating pages.
+ *
+ * As some balloon drivers can register multiple balloon devices
+ * for a single guest, this also helps compaction / migration to
+ * properly deal with multiple balloon pagesets, when required.
+ */
+ mapping->private_data = b_dev_info;
+ b_dev_info->mapping = mapping;
+
+ return mapping;
+}
+EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
+
+static inline void __isolate_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_del(&page->lru);
+ b_dev_info->isolated_pages++;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline void __putback_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_add(&page->lru, &b_dev_info->pages);
+ b_dev_info->isolated_pages--;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline int __migrate_balloon_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
+}
+
+/* __isolate_lru_page() counterpart for a ballooned page */
+bool balloon_page_isolate(struct page *page)
+{
+ /*
+ * Avoid burning cycles with pages that are yet under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a balloon page being freed under us and
+ * raise its refcount preventing __free_pages() from doing its job,
+ * the put_page() at the end of this block will take care of
+ * releasing this page, thus avoiding a nasty leak.
+ */
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * As balloon pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the balloon driver releasing a page.
+ *
+ * In order to avoid having an already isolated balloon page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released by
+ * the balloon driver, let's be sure we have the page lock
+ * before proceeding with the balloon page isolation steps.
+ */
+ if (likely(trylock_page(page))) {
+ /*
+ * A ballooned page, by default, has just one refcount.
+ * Prevent concurrent compaction threads from isolating
+ * an already isolated balloon page by refcount check.
+ */
+ if (__is_movable_balloon_page(page) &&
+ page_count(page) == 2) {
+ __isolate_balloon_page(page);
+ unlock_page(page);
+ return true;
+ }
+ unlock_page(page);
+ }
+ put_page(page);
+ }
+ return false;
+}
+
+/* putback_lru_page() counterpart for a ballooned page */
+void balloon_page_putback(struct page *page)
+{
+ /*
+ * 'lock_page()' stabilizes the page and prevents races against
+ * concurrent isolation threads attempting to re-isolate it.
+ */
+ lock_page(page);
+
+ if (__is_movable_balloon_page(page)) {
+ __putback_balloon_page(page);
+ /* drop the extra ref count taken for page isolation */
+ put_page(page);
+ } else {
+ WARN_ON(1);
+ dump_page(page, "not movable balloon page");
+ }
+ unlock_page(page);
+}
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct address_space *mapping;
+ int rc = -EAGAIN;
+
+ /*
+ * Block others from accessing the 'newpage' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'newpage' at this point.
+ */
+ BUG_ON(!trylock_page(newpage));
+
+ if (WARN_ON(!__is_movable_balloon_page(page))) {
+ dump_page(page, "not movable balloon page");
+ unlock_page(newpage);
+ return rc;
+ }
+
+ mapping = page->mapping;
+ if (mapping)
+ rc = __migrate_balloon_page(mapping, newpage, page, mode);
+
+ unlock_page(newpage);
+ return rc;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d53112fcb40..90bd3507b41 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,17 +1,21 @@
/*
- * linux/mm/bootmem.c
+ * bootmem - A boot-time physical memory allocator and configurator
*
* Copyright (C) 1999 Ingo Molnar
- * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
*
- * simple boot-time physical memory area allocator and
- * free memory collector. It's used to deal with reserved
- * system memory and memory holes as well.
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
*/
#include <linux/init.h>
#include <linux/pfn.h>
+#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -19,35 +23,53 @@
#include "internal.h"
-/*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
- */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+ .bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
-static LIST_HEAD(bdata_list);
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
{
- unsigned long mapsize;
+ bootmem_debug = 1;
+ return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
- mapsize = (pages+7)/8;
- mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
- mapsize >>= PAGE_SHIFT;
+#define bdebug(fmt, args...) ({ \
+ if (unlikely(bootmem_debug)) \
+ printk(KERN_INFO \
+ "bootmem::%s " fmt, \
+ __func__, ## args); \
+})
- return mapsize;
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+ unsigned long bytes = DIV_ROUND_UP(pages, 8);
+
+ return ALIGN(bytes, sizeof(long));
+}
+
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
+ */
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+{
+ unsigned long bytes = bootmap_bytes(pages);
+
+ return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
}
/*
@@ -57,44 +79,27 @@ static void __init link_bootmem(bootmem_data_t *bdata)
{
bootmem_data_t *ent;
- if (list_empty(&bdata_list)) {
- list_add(&bdata->list, &bdata_list);
- return;
- }
- /* insert in order */
list_for_each_entry(ent, &bdata_list, list) {
- if (bdata->node_boot_start < ent->node_boot_start) {
+ if (bdata->node_min_pfn < ent->node_min_pfn) {
list_add_tail(&bdata->list, &ent->list);
return;
}
}
- list_add_tail(&bdata->list, &bdata_list);
-}
-
-/*
- * Given an initialised bdata, it returns the size of the boot bitmap
- */
-static unsigned long __init get_mapsize(bootmem_data_t *bdata)
-{
- unsigned long mapsize;
- unsigned long start = PFN_DOWN(bdata->node_boot_start);
- unsigned long end = bdata->node_low_pfn;
- mapsize = ((end - start) + 7) / 8;
- return ALIGN(mapsize, sizeof(long));
+ list_add_tail(&bdata->list, &bdata_list);
}
/*
* Called once to set up the allocator itself.
*/
-static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
unsigned long mapstart, unsigned long start, unsigned long end)
{
- bootmem_data_t *bdata = pgdat->bdata;
unsigned long mapsize;
+ mminit_validate_memmodel_limits(&start, &end);
bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
- bdata->node_boot_start = PFN_PHYS(start);
+ bdata->node_min_pfn = start;
bdata->node_low_pfn = end;
link_bootmem(bdata);
@@ -102,334 +107,563 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
* Initially all pages are reserved - setup_arch() has to
* register free RAM areas explicitly.
*/
- mapsize = get_mapsize(bdata);
+ mapsize = bootmap_bytes(end - start);
memset(bdata->node_bootmem_map, 0xff, mapsize);
+ bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
+ bdata - bootmem_node_data, start, mapstart, end, mapsize);
+
return mapsize;
}
+/**
+ * init_bootmem_node - register a node as boot memory
+ * @pgdat: node to register
+ * @freepfn: pfn where the bitmap for this node is to be placed
+ * @startpfn: first pfn on the node
+ * @endpfn: first pfn after the node
+ *
+ * Returns the number of bytes needed to hold the bitmap for this node.
+ */
+unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
+ unsigned long startpfn, unsigned long endpfn)
+{
+ return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
+}
+
+/**
+ * init_bootmem - register boot memory
+ * @start: pfn where the bitmap is to be placed
+ * @pages: number of available physical pages
+ *
+ * Returns the number of bytes needed to hold the bitmap.
+ */
+unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+{
+ max_low_pfn = pages;
+ min_low_pfn = start;
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+}
+
/*
- * Marks a particular physical memory range as unallocatable. Usable RAM
- * might be used for boot-time allocations - or it might get added
- * to the free page pool later on.
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @physaddr: starting physical address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
*/
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
- unsigned long size)
+void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
- unsigned long sidx, eidx;
- unsigned long i;
+ unsigned long cursor, end;
- /*
- * round up, partially reserved pages are considered
- * fully reserved.
- */
- BUG_ON(!size);
- BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
- BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
+ kmemleak_free_part(__va(physaddr), size);
- sidx = PFN_DOWN(addr - bdata->node_boot_start);
- eidx = PFN_UP(addr + size - bdata->node_boot_start);
+ cursor = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
- for (i = sidx; i < eidx; i++)
- if (test_and_set_bit(i, bdata->node_bootmem_map)) {
-#ifdef CONFIG_DEBUG_BOOTMEM
- printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
-#endif
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
+
+static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+{
+ struct page *page;
+ unsigned long *map, start, end, pages, count = 0;
+
+ if (!bdata->node_bootmem_map)
+ return 0;
+
+ map = bdata->node_bootmem_map;
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
+ bdebug("nid=%td start=%lx end=%lx\n",
+ bdata - bootmem_node_data, start, end);
+
+ while (start < end) {
+ unsigned long idx, vec;
+ unsigned shift;
+
+ idx = start - bdata->node_min_pfn;
+ shift = idx & (BITS_PER_LONG - 1);
+ /*
+ * vec holds at most BITS_PER_LONG map bits,
+ * bit 0 corresponds to start.
+ */
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (shift) {
+ vec >>= shift;
+ if (end - start >= BITS_PER_LONG)
+ vec |= ~map[idx / BITS_PER_LONG + 1] <<
+ (BITS_PER_LONG - shift);
}
+ /*
+ * If we have a properly aligned and fully unreserved
+ * BITS_PER_LONG block of pages in front of us, free
+ * it in one go.
+ */
+ if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
+ int order = ilog2(BITS_PER_LONG);
+
+ __free_pages_bootmem(pfn_to_page(start), order);
+ count += BITS_PER_LONG;
+ start += BITS_PER_LONG;
+ } else {
+ unsigned long cur = start;
+
+ start = ALIGN(start + 1, BITS_PER_LONG);
+ while (vec && cur != start) {
+ if (vec & 1) {
+ page = pfn_to_page(cur);
+ __free_pages_bootmem(page, 0);
+ count++;
+ }
+ vec >>= 1;
+ ++cur;
+ }
+ }
+ }
+
+ page = virt_to_page(bdata->node_bootmem_map);
+ pages = bdata->node_low_pfn - bdata->node_min_pfn;
+ pages = bootmem_bootmap_pages(pages);
+ count += pages;
+ while (pages--)
+ __free_pages_bootmem(page++, 0);
+
+ bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
+
+ return count;
}
-static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
- unsigned long size)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
{
- unsigned long sidx, eidx;
- unsigned long i;
+ struct zone *z;
- /*
- * round down end of usable mem, partially free pages are
- * considered reserved.
- */
- BUG_ON(!size);
- BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);
+ if (reset_managed_pages_done)
+ return;
- if (addr < bdata->last_success)
- bdata->last_success = addr;
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ z->managed_pages = 0;
+}
- /*
- * Round up the beginning of the address.
- */
- sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
- eidx = PFN_DOWN(addr + size - bdata->node_boot_start);
+void __init reset_all_zones_managed_pages(void)
+{
+ struct pglist_data *pgdat;
- for (i = sidx; i < eidx; i++) {
- if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map)))
- BUG();
- }
+ for_each_online_pgdat(pgdat)
+ reset_node_managed_pages(pgdat);
+ reset_managed_pages_done = 1;
}
-/*
- * We 'merge' subsequent allocations to save space. We might 'lose'
- * some fraction of a page if allocations cannot be satisfied due to
- * size constraints on boxes where there is physical RAM space
- * fragmentation - in these cases (mostly large memory boxes) this
- * is not a problem.
- *
- * On low memory boxes we get it right in 100% of the cases.
- *
- * alignment has to be a power of 2 value.
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
*
- * NOTE: This function is _not_ reentrant.
+ * Returns the number of pages actually released.
*/
-void * __init
-__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
- unsigned long align, unsigned long goal, unsigned long limit)
+unsigned long __init free_all_bootmem(void)
{
- unsigned long offset, remaining_size, areasize, preferred;
- unsigned long i, start = 0, incr, eidx, end_pfn;
- void *ret;
-
- if (!size) {
- printk("__alloc_bootmem_core(): zero-sized request\n");
- BUG();
- }
- BUG_ON(align & (align-1));
+ unsigned long total_pages = 0;
+ bootmem_data_t *bdata;
- if (limit && bdata->node_boot_start >= limit)
- return NULL;
+ reset_all_zones_managed_pages();
- end_pfn = bdata->node_low_pfn;
- limit = PFN_DOWN(limit);
- if (limit && end_pfn > limit)
- end_pfn = limit;
+ list_for_each_entry(bdata, &bdata_list, list)
+ total_pages += free_all_bootmem_core(bdata);
- eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
- offset = 0;
- if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
- offset = align - (bdata->node_boot_start & (align - 1UL));
- offset = PFN_DOWN(offset);
+ totalram_pages += total_pages;
- /*
- * We try to allocate bootmem pages above 'goal'
- * first, then we try to allocate lower pages.
- */
- if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
- preferred = goal - bdata->node_boot_start;
-
- if (bdata->last_success >= preferred)
- if (!limit || (limit && limit > bdata->last_success))
- preferred = bdata->last_success;
- } else
- preferred = 0;
-
- preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
- areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
- incr = align >> PAGE_SHIFT ? : 1;
-
-restart_scan:
- for (i = preferred; i < eidx; i += incr) {
- unsigned long j;
- i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
- i = ALIGN(i, incr);
- if (i >= eidx)
- break;
- if (test_bit(i, bdata->node_bootmem_map))
- continue;
- for (j = i + 1; j < i + areasize; ++j) {
- if (j >= eidx)
- goto fail_block;
- if (test_bit(j, bdata->node_bootmem_map))
- goto fail_block;
- }
- start = i;
- goto found;
- fail_block:
- i = ALIGN(j, incr);
- }
+ return total_pages;
+}
- if (preferred > offset) {
- preferred = offset;
- goto restart_scan;
- }
- return NULL;
+static void __init __free(bootmem_data_t *bdata,
+ unsigned long sidx, unsigned long eidx)
+{
+ unsigned long idx;
-found:
- bdata->last_success = PFN_PHYS(start);
- BUG_ON(start >= eidx);
+ bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn);
- /*
- * Is the next page of the previous allocation-end the start
- * of this allocation's buffer? If yes then we can 'merge'
- * the previous partial page with this allocation.
- */
- if (align < PAGE_SIZE &&
- bdata->last_offset && bdata->last_pos+1 == start) {
- offset = ALIGN(bdata->last_offset, align);
- BUG_ON(offset > PAGE_SIZE);
- remaining_size = PAGE_SIZE - offset;
- if (size < remaining_size) {
- areasize = 0;
- /* last_pos unchanged */
- bdata->last_offset = offset + size;
- ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset +
- bdata->node_boot_start);
- } else {
- remaining_size = size - remaining_size;
- areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
- ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
- offset +
- bdata->node_boot_start);
- bdata->last_pos = start + areasize - 1;
- bdata->last_offset = remaining_size;
- }
- bdata->last_offset &= ~PAGE_MASK;
- } else {
- bdata->last_pos = start + areasize - 1;
- bdata->last_offset = size & ~PAGE_MASK;
- ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
- }
+ if (bdata->hint_idx > sidx)
+ bdata->hint_idx = sidx;
- /*
- * Reserve the area now:
- */
- for (i = start; i < start + areasize; i++)
- if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
+ for (idx = sidx; idx < eidx; idx++)
+ if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
BUG();
- memset(ret, 0, size);
- return ret;
}
-static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
+static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
+ unsigned long eidx, int flags)
{
- struct page *page;
- unsigned long pfn;
- bootmem_data_t *bdata = pgdat->bdata;
- unsigned long i, count, total = 0;
unsigned long idx;
- unsigned long *map;
- int gofast = 0;
+ int exclusive = flags & BOOTMEM_EXCLUSIVE;
+
+ bdebug("nid=%td start=%lx end=%lx flags=%x\n",
+ bdata - bootmem_node_data,
+ sidx + bdata->node_min_pfn,
+ eidx + bdata->node_min_pfn,
+ flags);
+
+ for (idx = sidx; idx < eidx; idx++)
+ if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
+ if (exclusive) {
+ __free(bdata, sidx, idx);
+ return -EBUSY;
+ }
+ bdebug("silent double reserve of PFN %lx\n",
+ idx + bdata->node_min_pfn);
+ }
+ return 0;
+}
- BUG_ON(!bdata->node_bootmem_map);
+static int __init mark_bootmem_node(bootmem_data_t *bdata,
+ unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long sidx, eidx;
- count = 0;
- /* first extant page of the node */
- pfn = PFN_DOWN(bdata->node_boot_start);
- idx = bdata->node_low_pfn - pfn;
- map = bdata->node_bootmem_map;
- /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
- if (bdata->node_boot_start == 0 ||
- ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
- gofast = 1;
- for (i = 0; i < idx; ) {
- unsigned long v = ~map[i / BITS_PER_LONG];
+ bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
+ bdata - bootmem_node_data, start, end, reserve, flags);
- if (gofast && v == ~0UL) {
- int order;
+ BUG_ON(start < bdata->node_min_pfn);
+ BUG_ON(end > bdata->node_low_pfn);
- page = pfn_to_page(pfn);
- count += BITS_PER_LONG;
- order = ffs(BITS_PER_LONG) - 1;
- __free_pages_bootmem(page, order);
- i += BITS_PER_LONG;
- page += BITS_PER_LONG;
- } else if (v) {
- unsigned long m;
-
- page = pfn_to_page(pfn);
- for (m = 1; m && i < idx; m<<=1, page++, i++) {
- if (v & m) {
- count++;
- __free_pages_bootmem(page, 0);
- }
- }
- } else {
- i += BITS_PER_LONG;
+ sidx = start - bdata->node_min_pfn;
+ eidx = end - bdata->node_min_pfn;
+
+ if (reserve)
+ return __reserve(bdata, sidx, eidx, flags);
+ else
+ __free(bdata, sidx, eidx);
+ return 0;
+}
+
+static int __init mark_bootmem(unsigned long start, unsigned long end,
+ int reserve, int flags)
+{
+ unsigned long pos;
+ bootmem_data_t *bdata;
+
+ pos = start;
+ list_for_each_entry(bdata, &bdata_list, list) {
+ int err;
+ unsigned long max;
+
+ if (pos < bdata->node_min_pfn ||
+ pos >= bdata->node_low_pfn) {
+ BUG_ON(pos != start);
+ continue;
}
- pfn += BITS_PER_LONG;
- }
- total += count;
- /*
- * Now free the allocator bitmap itself, it's not
- * needed anymore:
- */
- page = virt_to_page(bdata->node_bootmem_map);
- count = 0;
- idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
- for (i = 0; i < idx; i++, page++) {
- __free_pages_bootmem(page, 0);
- count++;
- }
- total += count;
- bdata->node_bootmem_map = NULL;
+ max = min(bdata->node_low_pfn, end);
+
+ err = mark_bootmem_node(bdata, pos, max, reserve, flags);
+ if (reserve && err) {
+ mark_bootmem(start, pos, 0, 0);
+ return err;
+ }
- return total;
+ if (max == end)
+ return 0;
+ pos = bdata->node_low_pfn;
+ }
+ BUG();
}
-unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
- unsigned long startpfn, unsigned long endpfn)
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
{
- return init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
+ unsigned long start, end;
+
+ kmemleak_free_part(__va(physaddr), size);
+
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
+
+ mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
-void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
+/**
+ * free_bootmem - mark a page range as usable
+ * @physaddr: starting physical address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
- reserve_bootmem_core(pgdat->bdata, physaddr, size);
+ unsigned long start, end;
+
+ kmemleak_free_part(__va(physaddr), size);
+
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
+
+ mark_bootmem(start, end, 0, 0);
}
-void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
+/**
+ * reserve_bootmem_node - mark a page range as reserved
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must reside completely on the specified node.
+ */
+int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size, int flags)
{
- free_bootmem_core(pgdat->bdata, physaddr, size);
+ unsigned long start, end;
+
+ start = PFN_DOWN(physaddr);
+ end = PFN_UP(physaddr + size);
+
+ return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
}
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+/**
+ * reserve_bootmem - mark a page range as reserved
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ * @flags: reservation flags (see linux/bootmem.h)
+ *
+ * Partial pages will be reserved.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+ int flags)
{
- return free_all_bootmem_core(pgdat);
+ unsigned long start, end;
+
+ start = PFN_DOWN(addr);
+ end = PFN_UP(addr + size);
+
+ return mark_bootmem(start, end, 1, flags);
}
-unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+static unsigned long __init align_idx(struct bootmem_data *bdata,
+ unsigned long idx, unsigned long step)
{
- max_low_pfn = pages;
- min_low_pfn = start;
- return init_bootmem_core(NODE_DATA(0), start, 0, pages);
+ unsigned long base = bdata->node_min_pfn;
+
+ /*
+ * Align the index with respect to the node start so that the
+ * combination of both satisfies the requested alignment.
+ */
+
+ return ALIGN(base + idx, step) - base;
}
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+static unsigned long __init align_off(struct bootmem_data *bdata,
+ unsigned long off, unsigned long align)
{
- reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ unsigned long base = PFN_PHYS(bdata->node_min_pfn);
+
+ /* Same as align_idx for byte offsets */
+
+ return ALIGN(base + off, align) - base;
}
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
-void __init free_bootmem(unsigned long addr, unsigned long size)
+static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
- free_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ unsigned long fallback = 0;
+ unsigned long min, max, start, sidx, midx, step;
+
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
+ BUG_ON(!size);
+ BUG_ON(align & (align - 1));
+ BUG_ON(limit && goal + size > limit);
+
+ if (!bdata->node_bootmem_map)
+ return NULL;
+
+ min = bdata->node_min_pfn;
+ max = bdata->node_low_pfn;
+
+ goal >>= PAGE_SHIFT;
+ limit >>= PAGE_SHIFT;
+
+ if (limit && max > limit)
+ max = limit;
+ if (max <= min)
+ return NULL;
+
+ step = max(align >> PAGE_SHIFT, 1UL);
+
+ if (goal && min < goal && goal < max)
+ start = ALIGN(goal, step);
+ else
+ start = ALIGN(min, step);
+
+ sidx = start - bdata->node_min_pfn;
+ midx = max - bdata->node_min_pfn;
+
+ if (bdata->hint_idx > sidx) {
+ /*
+ * Handle the valid case of sidx being zero and still
+ * catch the fallback below.
+ */
+ fallback = sidx + 1;
+ sidx = align_idx(bdata, bdata->hint_idx, step);
+ }
+
+ while (1) {
+ int merge;
+ void *region;
+ unsigned long eidx, i, start_off, end_off;
+find_block:
+ sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
+ sidx = align_idx(bdata, sidx, step);
+ eidx = sidx + PFN_UP(size);
+
+ if (sidx >= midx || eidx > midx)
+ break;
+
+ for (i = sidx; i < eidx; i++)
+ if (test_bit(i, bdata->node_bootmem_map)) {
+ sidx = align_idx(bdata, i, step);
+ if (sidx == i)
+ sidx += step;
+ goto find_block;
+ }
+
+ if (bdata->last_end_off & (PAGE_SIZE - 1) &&
+ PFN_DOWN(bdata->last_end_off) + 1 == sidx)
+ start_off = align_off(bdata, bdata->last_end_off, align);
+ else
+ start_off = PFN_PHYS(sidx);
+
+ merge = PFN_DOWN(start_off) < sidx;
+ end_off = start_off + size;
+
+ bdata->last_end_off = end_off;
+ bdata->hint_idx = PFN_UP(end_off);
+
+ /*
+ * Reserve the area now:
+ */
+ if (__reserve(bdata, PFN_DOWN(start_off) + merge,
+ PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
+ BUG();
+
+ region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
+ start_off);
+ memset(region, 0, size);
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(region, size, 0, 0);
+ return region;
+ }
+
+ if (fallback) {
+ sidx = align_idx(bdata, fallback - 1, step);
+ fallback = 0;
+ goto find_block;
+ }
+
+ return NULL;
}
-unsigned long __init free_all_bootmem(void)
+static void * __init alloc_bootmem_core(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
{
- return free_all_bootmem_core(NODE_DATA(0));
+ bootmem_data_t *bdata;
+ void *region;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+ list_for_each_entry(bdata, &bdata_list, list) {
+ if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
+ continue;
+ if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
+ break;
+
+ region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+ if (region)
+ return region;
+ }
+
+ return NULL;
}
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
{
- bootmem_data_t *bdata;
void *ptr;
- list_for_each_entry(bdata, &bdata_list, list) {
- ptr = __alloc_bootmem_core(bdata, size, align, goal, 0);
- if (ptr)
- return ptr;
+restart:
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
+ if (goal) {
+ goal = 0;
+ goto restart;
}
+
return NULL;
}
-void * __init __alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal)
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = 0;
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
- void *mem = __alloc_bootmem_nopanic(size,align,goal);
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
if (mem)
return mem;
@@ -441,47 +675,186 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return NULL;
}
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = 0;
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ return ___alloc_bootmem(size, align, goal, limit);
+}
+
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+again:
+
+ /* do not panic in alloc_bootmem_bdata() */
+ if (limit && goal + size > limit)
+ limit = 0;
+
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ ptr = alloc_bootmem_core(size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
+ if (goal) {
+ goal = 0;
+ goto again;
+ }
+
+ return NULL;
+}
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal,
+ unsigned long limit)
+{
void *ptr;
- ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
if (ptr)
return ptr;
- return __alloc_bootmem(size, align, goal);
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+ unsigned long end_pfn;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ /* try a goal at MAX_DMA32_PFN first if the node ends above it and goal lies below it */
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+ (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+ void *ptr;
+ unsigned long new_goal;
+
+ new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
+ new_goal, 0);
+ if (ptr)
+ return ptr;
+ }
+#endif
+
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+
}
#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
#endif
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
- bootmem_data_t *bdata;
- void *ptr;
-
- list_for_each_entry(bdata, &bdata_list, list) {
- ptr = __alloc_bootmem_core(bdata, size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
- if (ptr)
- return ptr;
- }
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of low memory");
- return NULL;
+void * __init __alloc_bootmem_low_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem_nopanic(size, align, goal,
+ ARCH_LOW_ADDRESS_LIMIT);
}
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- return __alloc_bootmem_core(pgdat->bdata, size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 00000000000..d0eac435040
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,409 @@
+/*
+ * Cleancache frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of cleancache. See
+ * Documentation/vm/cleancache.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/exportfs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/cleancache.h>
+
+/*
+ * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * to the cleancache "backend" implementation functions.
+ */
+static struct cleancache_ops *cleancache_ops __read_mostly;
+
+/*
+ * Counters available via /sys/kernel/debug/cleancache (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 cleancache_succ_gets;
+static u64 cleancache_failed_gets;
+static u64 cleancache_puts;
+static u64 cleancache_invalidates;
+
+/*
+ * When no backend is registered all calls to init_fs and init_shared_fs
+ * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
+ * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
+ * [shared_|]fs_poolid_map) are given to the respective super block
+ * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
+ * registers with cleancache the previous calls to init_fs and init_shared_fs
+ * are executed to create tmem_pools and set the respective poolids. While no
+ * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
+ */
+#define MAX_INITIALIZABLE_FS 32
+#define FAKE_FS_POOLID_OFFSET 1000
+#define FAKE_SHARED_FS_POOLID_OFFSET 2000
+
+#define FS_NO_BACKEND (-1)
+#define FS_UNKNOWN (-2)
+static int fs_poolid_map[MAX_INITIALIZABLE_FS];
+static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
+static char *uuids[MAX_INITIALIZABLE_FS];
+/*
+ * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
+ * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
+ * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
+ */
+static DEFINE_MUTEX(poolid_mutex);
+/*
+ * While cleancache_ops is NULL (the default) all calls to the cleancache
+ * functions, except __cleancache_invalidate_fs and
+ * __cleancache_init_[shared|]fs, return early via "if (!cleancache_ops)".
+ * This means multiple threads (from different filesystems) will be checking
+ * cleancache_ops. Using a plain pointer instead of an atomic_t or a flag
+ * guarded by a spinlock is OK - we tolerate a delay between the backend being
+ * initialized (cleancache_ops set to non-NULL) and the filesystems actually
+ * starting to call the backend. The inverse (when unloading) is obviously
+ * not good - but this shim does not do that (yet).
+ */
+
+/*
+ * The backends and filesystems work all asynchronously. This is b/c the
+ * backends can be built as modules.
+ * The usual sequence of events is:
+ * a) mount / -> __cleancache_init_fs is called. We set the
+ * [shared_|]fs_poolid_map and uuids for.
+ *
+ * b). user does I/Os -> we call the rest of __cleancache_* functions
+ * which return immediately as cleancache_ops is false.
+ *
+ * c). modprobe zcache -> cleancache_register_ops. We init the backend
+ * and set cleancache_ops to true, and for any fs_poolid_map
+ * (which is set by __cleancache_init_fs) we initialize the poolid.
+ *
+ * d). user does I/Os -> now that cleancache_ops is true all the
+ * __cleancache_* functions can call the backend. They all check
+ * that fs_poolid_map is valid and if so invoke the backend.
+ *
+ * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
+ * reset (which is the second check in the __cleancache_* ops
+ * to call the backend).
+ *
+ * The sequence of events could also be c), followed by a), d) and e). The
+ * c) would not happen anymore. There is also the chance of c), and one thread
+ * doing a) + d), and another doing e). For that case we depend on the
+ * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
+ * that it handles all I/Os before it invalidates the fs, which is the last
+ * part of the unmounting process).
+ *
+ * Note: The astute reader will notice that there is no "rmmod zcache" case.
+ * This is b/c the functionality for that is not yet implemented and when
+ * done, will require some extra locking not yet devised.
+ */
+
+/*
+ * Register operations for cleancache, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+{
+ struct cleancache_ops *old = cleancache_ops;
+ int i;
+
+ mutex_lock(&poolid_mutex);
+ for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
+ if (fs_poolid_map[i] == FS_NO_BACKEND)
+ fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
+ if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
+ shared_fs_poolid_map[i] = ops->init_shared_fs
+ (uuids[i], PAGE_SIZE);
+ }
+ /*
+ * We MUST set cleancache_ops _after_ we have called the backends
+ * init_fs or init_shared_fs functions. Otherwise the compiler might
+ * re-order where cleancache_ops is set in this function.
+ */
+ barrier();
+ cleancache_ops = ops;
+ mutex_unlock(&poolid_mutex);
+ return old;
+}
+EXPORT_SYMBOL(cleancache_register_ops);
+
+/* Called by a cleancache-enabled filesystem at time of mount */
+void __cleancache_init_fs(struct super_block *sb)
+{
+ int i;
+
+ mutex_lock(&poolid_mutex);
+ for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
+ if (fs_poolid_map[i] == FS_UNKNOWN) {
+ sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
+ if (cleancache_ops)
+ fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
+ else
+ fs_poolid_map[i] = FS_NO_BACKEND;
+ break;
+ }
+ }
+ mutex_unlock(&poolid_mutex);
+}
+EXPORT_SYMBOL(__cleancache_init_fs);
+
+/* Called by a cleancache-enabled clustered filesystem at time of mount */
+void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+{
+ int i;
+
+ mutex_lock(&poolid_mutex);
+ for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
+ if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
+ sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
+ uuids[i] = uuid;
+ if (cleancache_ops)
+ shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
+ (uuid, PAGE_SIZE);
+ else
+ shared_fs_poolid_map[i] = FS_NO_BACKEND;
+ break;
+ }
+ }
+ mutex_unlock(&poolid_mutex);
+}
+EXPORT_SYMBOL(__cleancache_init_shared_fs);
+
+/*
+ * If the filesystem uses exportable filehandles, use the filehandle as
+ * the key, else use the inode number.
+ */
+static int cleancache_get_key(struct inode *inode,
+ struct cleancache_filekey *key)
+{
+ int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
+ int len = 0, maxlen = CLEANCACHE_KEY_MAX;
+ struct super_block *sb = inode->i_sb;
+
+ key->u.ino = inode->i_ino;
+ if (sb->s_export_op != NULL) {
+ fhfn = sb->s_export_op->encode_fh;
+ if (fhfn) {
+ len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
+ if (len <= FILEID_ROOT || len == FILEID_INVALID)
+ return -1;
+ if (maxlen > CLEANCACHE_KEY_MAX)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Returns a pool_id that is associated with a given fake poolid.
+ */
+static int get_poolid_from_fake(int fake_pool_id)
+{
+ if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
+ return shared_fs_poolid_map[fake_pool_id -
+ FAKE_SHARED_FS_POOLID_OFFSET];
+ else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
+ return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
+ return FS_NO_BACKEND;
+}
+
+/*
+ * "Get" data from cleancache associated with the poolid/inode/index
+ * that were specified when the data was put to cleancache and, if
+ * successful, use it to fill the specified page with data and return 0.
+ * The pageframe is unchanged and returns -1 if the get fails.
+ * Page must be locked by caller.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+int __cleancache_get_page(struct page *page)
+{
+ int ret = -1;
+ int pool_id;
+ int fake_pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops) {
+ cleancache_failed_gets++;
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (fake_pool_id < 0)
+ goto out;
+ pool_id = get_poolid_from_fake(fake_pool_id);
+
+ if (cleancache_get_key(page->mapping->host, &key) < 0)
+ goto out;
+
+ if (pool_id >= 0)
+ ret = cleancache_ops->get_page(pool_id,
+ key, page->index, page);
+ if (ret == 0)
+ cleancache_succ_gets++;
+ else
+ cleancache_failed_gets++;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(__cleancache_get_page);
+
+/*
+ * "Put" data from a page to cleancache and associate it with the
+ * (previously-obtained per-filesystem) poolid and the page's,
+ * inode and page index. Page must be locked. Note that a put_page
+ * always "succeeds", though a subsequent get_page may succeed or fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_put_page(struct page *page)
+{
+ int pool_id;
+ int fake_pool_id;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops) {
+ cleancache_puts++;
+ return;
+ }
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (fake_pool_id < 0)
+ return;
+
+ pool_id = get_poolid_from_fake(fake_pool_id);
+
+ if (pool_id >= 0 &&
+ cleancache_get_key(page->mapping->host, &key) >= 0) {
+ cleancache_ops->put_page(pool_id, key, page->index, page);
+ cleancache_puts++;
+ }
+}
+EXPORT_SYMBOL(__cleancache_put_page);
+
+/*
+ * Invalidate any data from cleancache associated with the poolid and the
+ * page's inode and page index so that a subsequent "get" will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_invalidate_page(struct address_space *mapping,
+ struct page *page)
+{
+ /* careful... page->mapping is NULL sometimes when this is called */
+ int pool_id;
+ int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops)
+ return;
+
+ if (fake_pool_id >= 0) {
+ pool_id = get_poolid_from_fake(fake_pool_id);
+ if (pool_id < 0)
+ return;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (cleancache_get_key(mapping->host, &key) >= 0) {
+ cleancache_ops->invalidate_page(pool_id,
+ key, page->index);
+ cleancache_invalidates++;
+ }
+ }
+}
+EXPORT_SYMBOL(__cleancache_invalidate_page);
+
+/*
+ * Invalidate all data from cleancache associated with the poolid and the
+ * mapping's inode so that all subsequent gets to this poolid/inode
+ * will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
+ */
+void __cleancache_invalidate_inode(struct address_space *mapping)
+{
+ int pool_id;
+ int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ struct cleancache_filekey key = { .u.key = { 0 } };
+
+ if (!cleancache_ops)
+ return;
+
+ if (fake_pool_id < 0)
+ return;
+
+ pool_id = get_poolid_from_fake(fake_pool_id);
+
+ if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
+ cleancache_ops->invalidate_inode(pool_id, key);
+}
+EXPORT_SYMBOL(__cleancache_invalidate_inode);
+
+/*
+ * Called by any cleancache-enabled filesystem at time of unmount;
+ * note that pool_id is surrendered and may be returned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs.
+ *
+ * The fake poolid stored in the super block is translated back to its
+ * [shared_|]fs_poolid_map slot, the slot is marked FS_UNKNOWN so it can be
+ * reused, and sb->cleancache_poolid is reset to -1. The backend is only
+ * told to invalidate the pool when the slot held a non-negative pool id:
+ * FS_NO_BACKEND, FS_UNKNOWN and the -1 this function itself stores in
+ * sb->cleancache_poolid are all negative and name no backend pool. This
+ * matches the pool_id >= 0 check done by the other __cleancache_* entry
+ * points before they call into the backend.
+ */
+void __cleancache_invalidate_fs(struct super_block *sb)
+{
+ int index;
+ int fake_pool_id = sb->cleancache_poolid;
+ int old_poolid = fake_pool_id;
+
+ /* poolid_mutex serializes updates of the poolid maps and uuids */
+ mutex_lock(&poolid_mutex);
+ if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
+ index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
+ old_poolid = shared_fs_poolid_map[index];
+ shared_fs_poolid_map[index] = FS_UNKNOWN;
+ uuids[index] = NULL;
+ } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
+ index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
+ old_poolid = fs_poolid_map[index];
+ fs_poolid_map[index] = FS_UNKNOWN;
+ }
+ sb->cleancache_poolid = -1;
+ /* Never hand a negative sentinel to the backend */
+ if (cleancache_ops && old_poolid >= 0)
+ cleancache_ops->invalidate_fs(old_poolid);
+ mutex_unlock(&poolid_mutex);
+}
+EXPORT_SYMBOL(__cleancache_invalidate_fs);
+
+static int __init init_cleancache(void)
+{
+ int i;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("cleancache", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
+ debugfs_create_u64("failed_gets", S_IRUGO,
+ root, &cleancache_failed_gets);
+ debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &cleancache_invalidates);
+#endif
+ for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
+ fs_poolid_map[i] = FS_UNKNOWN;
+ shared_fs_poolid_map[i] = FS_UNKNOWN;
+ }
+ return 0;
+}
+module_init(init_cleancache)
diff --git a/mm/compaction.c b/mm/compaction.c
new file mode 100644
index 00000000000..21bf292b642
--- /dev/null
+++ b/mm/compaction.c
@@ -0,0 +1,1291 @@
+/*
+ * linux/mm/compaction.c
+ *
+ * Memory compaction for the reduction of external fragmentation. Note that
+ * this heavily depends upon page migration to do all the real heavy
+ * lifting
+ *
+ * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
+ */
+#include <linux/swap.h>
+#include <linux/migrate.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
+#include <linux/backing-dev.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include <linux/balloon_compaction.h>
+#include <linux/page-isolation.h>
+#include "internal.h"
+
+#ifdef CONFIG_COMPACTION
+static inline void count_compact_event(enum vm_event_item item)
+{
+ count_vm_event(item);
+}
+
+static inline void count_compact_events(enum vm_event_item item, long delta)
+{
+ count_vm_events(item, delta);
+}
+#else
+#define count_compact_event(item) do { } while (0)
+#define count_compact_events(item, delta) do { } while (0)
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
+static unsigned long release_freepages(struct list_head *freelist)
+{
+ struct page *page, *next;
+ unsigned long count = 0;
+
+ list_for_each_entry_safe(page, next, freelist, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ count++;
+ }
+
+ return count;
+}
+
+static void map_pages(struct list_head *list)
+{
+ struct page *page;
+
+ list_for_each_entry(page, list, lru) {
+ arch_alloc_page(page, 0);
+ kernel_map_pages(page, 1, 1);
+ }
+}
+
+static inline bool migrate_async_suitable(int migratetype)
+{
+ return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
+}
+
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ if (cc->ignore_skip_hint)
+ return true;
+
+ return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long pfn;
+
+ zone->compact_cached_migrate_pfn[0] = start_pfn;
+ zone->compact_cached_migrate_pfn[1] = start_pfn;
+ zone->compact_cached_free_pfn = end_pfn;
+ zone->compact_blockskip_flush = false;
+
+ /* Walk the zone and mark every pageblock as suitable for isolation */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ cond_resched();
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ continue;
+
+ clear_pageblock_skip(page);
+ }
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Only flush if a full compaction finished recently */
+ if (zone->compact_blockskip_flush)
+ __reset_isolation_suitable(zone);
+ }
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool set_unsuitable, bool migrate_scanner)
+{
+ struct zone *zone = cc->zone;
+ unsigned long pfn;
+
+ if (cc->ignore_skip_hint)
+ return;
+
+ if (!page)
+ return;
+
+ if (nr_isolated)
+ return;
+
+ /*
+ * Only skip pageblocks when all forms of compaction will be known to
+ * fail in the near future.
+ */
+ if (set_unsuitable)
+ set_pageblock_skip(page);
+
+ pfn = page_to_pfn(page);
+
+ /* Update where async and sync compaction should restart */
+ if (migrate_scanner) {
+ if (cc->finished_update_migrate)
+ return;
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+ } else {
+ if (cc->finished_update_free)
+ return;
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+ }
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool set_unsuitable, bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+ return need_resched() || spin_is_contended(lock);
+}
+
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. Check if the process needs to be scheduled or
+ * if the lock is contended. For async compaction, back out in the event
+ * if contention is severe. For sync compaction, schedule.
+ *
+ * Returns true if the lock is held.
+ * Returns false if the lock is released and compaction should abort
+ */
+static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
+ bool locked, struct compact_control *cc)
+{
+ if (should_release_lock(lock)) {
+ if (locked) {
+ spin_unlock_irqrestore(lock, *flags);
+ locked = false;
+ }
+
+ /* async aborts if taking too long or contended */
+ if (cc->mode == MIGRATE_ASYNC) {
+ cc->contended = true;
+ return false;
+ }
+
+ cond_resched();
+ }
+
+ if (!locked)
+ spin_lock_irqsave(lock, *flags);
+ return true;
+}
+
+/*
+ * Aside from avoiding lock contention, compaction also periodically checks
+ * need_resched() and either schedules in sync compaction or aborts async
+ * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * is used where no lock is concerned.
+ *
+ * Returns false when no scheduling was needed, or sync compaction scheduled.
+ * Returns true when async compaction should abort.
+ */
+static inline bool compact_should_abort(struct compact_control *cc)
+{
+ /* async compaction aborts if contended */
+ if (need_resched()) {
+ if (cc->mode == MIGRATE_ASYNC) {
+ cc->contended = true;
+ return true;
+ }
+
+ cond_resched();
+ }
+
+ return false;
+}
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ /* If the page is a large free page, then disallow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return false;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
+/*
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
+ */
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+ unsigned long blockpfn,
+ unsigned long end_pfn,
+ struct list_head *freelist,
+ bool strict)
+{
+ int nr_scanned = 0, total_isolated = 0;
+ struct page *cursor, *valid_page = NULL;
+ unsigned long flags;
+ bool locked = false;
+ bool checked_pageblock = false;
+
+ cursor = pfn_to_page(blockpfn);
+
+ /* Isolate free pages. */
+ for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ int isolated, i;
+ struct page *page = cursor;
+
+ nr_scanned++;
+ if (!pfn_valid_within(blockpfn))
+ goto isolate_fail;
+
+ if (!valid_page)
+ valid_page = page;
+ if (!PageBuddy(page))
+ goto isolate_fail;
+
+ /*
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction do not
+ * spin on the lock and we acquire the lock as late as
+ * possible.
+ */
+ locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+ locked, cc);
+ if (!locked)
+ break;
+
+ /* Recheck this is a suitable migration target under lock */
+ if (!strict && !checked_pageblock) {
+ /*
+ * We need to check suitability of pageblock only once
+ * and this isolate_freepages_block() is called with
+ * pageblock range, so just check once is sufficient.
+ */
+ checked_pageblock = true;
+ if (!suitable_migration_target(page))
+ break;
+ }
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+ goto isolate_fail;
+
+ /* Found a free page, break it into order-0 pages */
+ isolated = split_free_page(page);
+ total_isolated += isolated;
+ for (i = 0; i < isolated; i++) {
+ list_add(&page->lru, freelist);
+ page++;
+ }
+
+ /* If a page was split, advance to the end of it */
+ if (isolated) {
+ blockpfn += isolated - 1;
+ cursor += isolated - 1;
+ continue;
+ }
+
+isolate_fail:
+ if (strict)
+ break;
+ else
+ continue;
+
+ }
+
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+ /*
+ * If strict isolation is requested by CMA then check that all the
+ * pages requested were isolated. If there were any failures, 0 is
+ * returned and CMA will fail.
+ */
+ if (strict && blockpfn < end_pfn)
+ total_isolated = 0;
+
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (blockpfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, total_isolated, true,
+ false);
+
+ count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_compact_events(COMPACTISOLATED, total_isolated);
+ return total_isolated;
+}
+
+/**
+ * isolate_freepages_range() - isolate free pages.
+ * @cc: Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Non-free pages, invalid PFNs, or zone boundaries within the
+ * [start_pfn, end_pfn) range are considered errors, cause function to
+ * undo its actions and return zero.
+ *
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in the middle of
+ * a free page).
+ */
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long isolated, pfn, block_end_pfn;
+ LIST_HEAD(freelist);
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
+ if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
+ break;
+
+ /*
+ * On subsequent iterations ALIGN() is actually not needed,
+ * but we keep it so as not to complicate the code.
+ */
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
+ &freelist, true);
+
+ /*
+ * In strict mode, isolate_freepages_block() returns 0 if
+ * there are any holes in the block (ie. invalid PFNs or
+ * non-free pages).
+ */
+ if (!isolated)
+ break;
+
+ /*
+ * If we managed to isolate pages, it is always (1 << n) *
+ * pageblock_nr_pages for some non-negative n. (Max order
+ * page may span two pageblocks).
+ */
+ }
+
+ /* split_free_page does not map the pages */
+ map_pages(&freelist);
+
+ if (pfn < end_pfn) {
+ /* Loop terminated early, cleanup. */
+ release_freepages(&freelist);
+ return 0;
+ }
+
+ /* We don't use freelists for anything. */
+ return pfn;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
+{
+ struct page *page;
+ unsigned int count[2] = { 0, };
+
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;
+
+ /* If locked we can use the interrupt unsafe versions */
+ if (locked) {
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ } else {
+ mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ }
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+ unsigned long active, inactive, isolated;
+
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ active = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
+
+ return isolated > (inactive + active) / 2;
+}
+
+/**
+ * isolate_migratepages_range() - isolate all migrate-able pages in range.
+ * @zone: Zone pages are in.
+ * @cc: Compaction control structure.
+ * @low_pfn: The first PFN of the range.
+ * @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
+ *
+ * Isolate all pages that can be migrated from the range specified by
+ * [low_pfn, end_pfn). Returns zero if there is a fatal signal
+ * pending, otherwise PFN of the first page that was not scanned
+ * (which may be less than, equal to or more than end_pfn).
+ *
+ * Assumes that cc->migratepages is empty and cc->nr_migratepages is
+ * zero.
+ *
+ * Apart from cc->migratepages and cc->nr_migratepages this function
+ * does not modify any cc's fields, in particular it does not modify
+ * (or read for that matter) cc->migrate_pfn.
+ */
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
+{
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
+ struct list_head *migratelist = &cc->migratepages;
+ struct lruvec *lruvec;
+ unsigned long flags;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
+ bool set_unsuitable = true;
+ const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
+ ISOLATE_ASYNC_MIGRATE : 0) |
+ (unevictable ? ISOLATE_UNEVICTABLE : 0);
+
+ /*
+ * Ensure that there are not too many pages isolated from the LRU
+ * list by either parallel reclaimers or compaction. If there are,
+ * delay for some time until fewer pages are isolated
+ */
+ while (unlikely(too_many_isolated(zone))) {
+ /* async migration should just abort */
+ if (cc->mode == MIGRATE_ASYNC)
+ return 0;
+
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ if (fatal_signal_pending(current))
+ return 0;
+ }
+
+ if (compact_should_abort(cc))
+ return 0;
+
+ /* Time to isolate some pages for migration */
+ for (; low_pfn < end_pfn; low_pfn++) {
+ /* give a chance to irqs before checking need_resched() */
+ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (should_release_lock(&zone->lru_lock)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
+ }
+
+ /*
+ * migrate_pfn does not necessarily start aligned to a
+ * pageblock. Ensure that pfn_valid is called when moving
+ * into a new MAX_ORDER_NR_PAGES range in case of large
+ * memory holes within the zone
+ */
+ if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if (!pfn_valid(low_pfn)) {
+ low_pfn += MAX_ORDER_NR_PAGES - 1;
+ continue;
+ }
+ }
+
+ if (!pfn_valid_within(low_pfn))
+ continue;
+ nr_scanned++;
+
+ /*
+ * Get the page and ensure the page is within the same zone.
+ * See the comment in isolate_freepages about overlapping
+ * nodes. It is deliberate that the new zone lock is not taken
+ * as memory compaction should not move pages between nodes.
+ */
+ page = pfn_to_page(low_pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ if (!valid_page)
+ valid_page = page;
+
+ /* If isolation recently failed, do not retry */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (last_pageblock_nr != pageblock_nr) {
+ int mt;
+
+ last_pageblock_nr = pageblock_nr;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /*
+ * For async migration, also only scan in MOVABLE
+ * blocks. Async migration is optimistic to see if
+ * the minimum amount of work satisfies the allocation
+ */
+ mt = get_pageblock_migratetype(page);
+ if (cc->mode == MIGRATE_ASYNC &&
+ !migrate_async_suitable(mt)) {
+ set_unsuitable = false;
+ goto next_pageblock;
+ }
+ }
+
+ /*
+ * Skip if free. page_order cannot be used without zone->lock
+ * as nothing prevents parallel allocations or buddy merging.
+ */
+ if (PageBuddy(page))
+ continue;
+
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU pages and balloon pages
+ * Skip any other type of page
+ */
+ if (!PageLRU(page)) {
+ if (unlikely(balloon_page_movable(page))) {
+ if (locked && balloon_page_isolate(page)) {
+ /* Successfully isolated */
+ goto isolate_success;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * PageLRU is set. lru_lock normally excludes isolation
+ * splitting and collapsing (collapsing has already happened
+ * if PageLRU is set) but the lock is not necessarily taken
+ * here and it is wasteful to take it just to check transhuge.
+ * Check TransHuge without lock and skip the whole pageblock if
+ * it's either a transhuge or hugetlbfs page, as calling
+ * compound_order() without preventing THP from splitting the
+ * page underneath us may return surprising results.
+ */
+ if (PageTransHuge(page)) {
+ if (!locked)
+ goto next_pageblock;
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * Migration will fail if an anonymous page is pinned in memory,
+ * so avoid taking lru_lock and isolating it unnecessarily in an
+ * admittedly racy check.
+ */
+ if (!page_mapping(page) &&
+ page_count(page) > page_mapcount(page))
+ continue;
+
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
+ if (!locked || fatal_signal_pending(current))
+ break;
+
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ /* Try isolate the page */
+ if (__isolate_lru_page(page, mode) != 0)
+ continue;
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ /* Successfully isolated */
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+
+ /* Avoid isolating too much */
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ ++low_pfn;
+ break;
+ }
+
+ continue;
+
+next_pageblock:
+ low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
+ }
+
+ acct_isolated(zone, locked, cc);
+
+ if (locked)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+ /*
+ * Update the pageblock-skip information and cached scanner pfn,
+ * if the whole pageblock was scanned without isolating any page.
+ */
+ if (low_pfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, nr_isolated,
+ set_unsuitable, true);
+
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
+ count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+
+ return low_pfn;
+}
+
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+#ifdef CONFIG_COMPACTION
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ *
+ * Pageblocks are visited from cc->free_pfn downwards. Every block that
+ * passes the checks below is handed to isolate_freepages_block() together
+ * with &cc->freepages, and the running total of isolated pages is written
+ * back to cc->nr_freepages before returning.
+ */
+static void isolate_freepages(struct zone *zone,
+ struct compact_control *cc)
+{
+ struct page *page;
+ unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long block_end_pfn; /* end of current pageblock */
+ unsigned long low_pfn; /* lowest pfn scanner is able to scan */
+ int nr_freepages = cc->nr_freepages; /* local running total, stored back at the end */
+ struct list_head *freelist = &cc->freepages;
+
+ /*
+ * Initialise the free scanner. The starting point is where we last
+ * successfully isolated from, zone-cached value, or the end of the
+ * zone when isolating for the first time. We need this aligned to
+ * the pageblock boundary, because we do
+ * block_start_pfn -= pageblock_nr_pages in the for loop.
+ * For ending point, take care when isolating in last pageblock of a
+ * zone which ends in the middle of a pageblock.
+ * The low boundary is the end of the pageblock the migration scanner
+ * is using.
+ */
+ block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+ block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
+ zone_end_pfn(zone));
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+
+ /*
+ * Isolate free pages until enough are available to migrate the
+ * pages on cc->migratepages. We stop searching if the migrate
+ * and free page scanners meet or enough free pages are isolated.
+ */
+ for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ block_end_pfn = block_start_pfn,
+ block_start_pfn -= pageblock_nr_pages) {
+ unsigned long isolated;
+
+ /*
+ * This can iterate a massively long zone without finding any
+ * suitable migration targets, so periodically check if we need
+ * to schedule, or even abort async compaction.
+ * The check runs once every SWAP_CLUSTER_MAX pageblocks.
+ */
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+ && compact_should_abort(cc))
+ break;
+
+ if (!pfn_valid(block_start_pfn))
+ continue;
+
+ /*
+ * Check for overlapping nodes/zones. It's possible on some
+ * configurations to have a setup like
+ * node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of
+ * pages do not belong to a single zone.
+ */
+ page = pfn_to_page(block_start_pfn);
+ if (page_zone(page) != zone)
+ continue;
+
+ /* Check the block is suitable for migration */
+ if (!suitable_migration_target(page))
+ continue;
+
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
+ cc->free_pfn = block_start_pfn;
+ isolated = isolate_freepages_block(cc, block_start_pfn,
+ block_end_pfn, freelist, false);
+ nr_freepages += isolated;
+
+ /*
+ * Set a flag that we successfully isolated in this pageblock.
+ * In the next loop iteration, zone->compact_cached_free_pfn
+ * will not be updated and thus it will effectively contain the
+ * highest pageblock we isolated pages from.
+ */
+ if (isolated)
+ cc->finished_update_free = true;
+
+ /*
+ * isolate_freepages_block() might have aborted due to async
+ * compaction being contended
+ */
+ if (cc->contended)
+ break;
+ }
+
+ /* split_free_page does not map the pages */
+ map_pages(freelist);
+
+ /*
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ * (compact_finished() tests cc->free_pfn <= cc->migrate_pfn).
+ */
+ if (block_start_pfn < low_pfn)
+ cc->free_pfn = cc->migrate_pfn;
+
+ cc->nr_freepages = nr_freepages;
+}
+
+/*
+ * Migration callback that supplies a destination page for @migratepage.
+ * Pages are taken from the head of cc->freepages, where @data is the
+ * compact_control cast to unsigned long. The list is refilled through
+ * isolate_freepages() when it is empty, unless compaction is contended.
+ * Returns NULL when no free page can be provided.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+					unsigned long data,
+					int **result)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+	struct page *target;
+
+	/* Refill only when empty and when we are not aborting on contention */
+	if (list_empty(&cc->freepages) && !cc->contended)
+		isolate_freepages(cc->zone, cc);
+
+	/* Still empty (or refill was not attempted): nothing to hand out */
+	if (list_empty(&cc->freepages))
+		return NULL;
+
+	/* Detach the first isolated free page and account for it */
+	target = list_entry(cc->freepages.next, struct page, lru);
+	list_del(&target->lru);
+	cc->nr_freepages--;
+
+	return target;
+}
+
+/*
+ * Migration callback that takes back a destination page which ended up
+ * unused: @page is linked onto cc->freepages again and the free page count
+ * is raised. All pages on the freelist are from the same zone, so there is
+ * no special handling needed for NUMA.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *control = (struct compact_control *)data;
+
+	list_add(&page->lru, &control->freepages);
+	control->nr_freepages += 1;
+}
+
+/* possible outcome of isolate_migratepages(), acted upon by compact_zone() */
+typedef enum {
+ ISOLATE_ABORT, /* Abort compaction now */
+ ISOLATE_NONE, /* No pages isolated, continue scanning */
+ ISOLATE_SUCCESS, /* Block was scanned; caller migrates if cc->nr_migratepages is non-zero */
+} isolate_migrate_t;
+
+/*
+ * Isolate all pages that can be migrated from the block pointed to by
+ * the migrate scanner within compact_control.
+ *
+ * Returns ISOLATE_NONE when the block is skipped (it would cross the free
+ * scanner or starts in a memory hole), ISOLATE_ABORT when the range scan
+ * returned 0 or compaction is contended, and ISOLATE_SUCCESS otherwise.
+ * cc->migrate_pfn is advanced in the NONE and SUCCESS cases.
+ */
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
+					struct compact_control *cc)
+{
+	unsigned long scan_start, scan_end;
+	unsigned long resume_pfn;
+
+	/* Never start below the zone; never scan past the current pageblock */
+	scan_start = max(cc->migrate_pfn, zone->zone_start_pfn);
+	scan_end = ALIGN(scan_start + 1, pageblock_nr_pages);
+
+	/* Skip the block instead of crossing the free scanner or a hole */
+	if (scan_end > cc->free_pfn || !pfn_valid(scan_start)) {
+		cc->migrate_pfn = scan_end;
+		return ISOLATE_NONE;
+	}
+
+	resume_pfn = isolate_migratepages_range(zone, cc, scan_start,
+						scan_end, false);
+	if (resume_pfn == 0 || cc->contended)
+		return ISOLATE_ABORT;
+
+	/* Remember where the migrate scanner continues next time */
+	cc->migrate_pfn = resume_pfn;
+
+	return ISOLATE_SUCCESS;
+}
+
+/*
+ * Decide whether the compaction run described by @cc should keep going.
+ *
+ * Returns COMPACT_PARTIAL on contention, on a pending fatal signal, or once
+ * a suitable free page exists; COMPACT_COMPLETE once the migrate and free
+ * scanners have met; COMPACT_CONTINUE in every other case.
+ */
+static int compact_finished(struct zone *zone,
+			    struct compact_control *cc)
+{
+	unsigned long wmark;
+	unsigned int order;
+
+	if (cc->contended || fatal_signal_pending(current))
+		return COMPACT_PARTIAL;
+
+	/* The run is over as soon as the two scanners have met */
+	if (cc->free_pfn <= cc->migrate_pfn) {
+		/* Reset the cached positions so the next run starts afresh */
+		zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+		zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+		zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
+		/*
+		 * Ask kswapd to clear the PG_migrate_skip information when
+		 * it goes to sleep. kswapd itself never sets the flag: the
+		 * decision to clear should come straight from an allocation
+		 * request.
+		 */
+		if (!current_is_kswapd())
+			zone->compact_blockskip_flush = true;
+
+		return COMPACT_COMPLETE;
+	}
+
+	/* order == -1 means compaction via /proc/sys/vm/compact_memory */
+	if (cc->order == -1)
+		return COMPACT_CONTINUE;
+
+	/* Keep going for as long as the watermark is not met */
+	wmark = low_wmark_pages(zone);
+	wmark += (1 << cc->order);
+	if (!zone_watermark_ok(zone, cc->order, wmark, 0, 0))
+		return COMPACT_CONTINUE;
+
+	/* Direct compactor: look for a suitable free page, smallest order first */
+	order = cc->order;
+	while (order < MAX_ORDER) {
+		struct free_area *fa = &zone->free_area[order];
+
+		/*
+		 * Done if a page of the right migratetype is free, or if the
+		 * allocation is large enough to set the block type anyway.
+		 */
+		if (!list_empty(&fa->free_list[cc->migratetype]) ||
+		    (cc->order >= pageblock_order && fa->nr_free))
+			return COMPACT_PARTIAL;
+
+		order++;
+	}
+
+	return COMPACT_CONTINUE;
+}
+
+/*
+ * compaction_suitable: should compaction be run on this zone right now?
+ * Returns
+ *   COMPACT_SKIPPED  - too few free pages for compaction, or the
+ *                      fragmentation index is within [0, threshold]
+ *   COMPACT_PARTIAL  - the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	unsigned long wmark;
+	int index;
+
+	/* order == -1 means compaction via /proc/sys/vm/compact_memory */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
+	 * The order-0 watermark has to be met before compacting. The 2UL
+	 * accounts for migration briefly needing copies of the pages being
+	 * moved, so the footprint is temporarily higher.
+	 */
+	wmark = low_wmark_pages(zone) + (2UL << order);
+	if (!zone_watermark_ok(zone, 0, wmark, 0, 0))
+		return COMPACT_SKIPPED;
+
+	/*
+	 * The fragmentation index tells low memory apart from external
+	 * fragmentation as the cause of allocation failures:
+	 *
+	 *   -1000        allocations might succeed depending on watermarks
+	 *   towards 0    failure is due to lack of memory
+	 *   towards 1000 failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation.
+	 */
+	index = fragmentation_index(zone, order);
+	if (index >= 0 && index <= sysctl_extfrag_threshold)
+		return COMPACT_SKIPPED;
+
+	if (index != -1000)
+		return COMPACT_CONTINUE;
+
+	return zone_watermark_ok(zone, order, wmark, 0, 0) ?
+		COMPACT_PARTIAL : COMPACT_CONTINUE;
+}
+
+static int compact_zone(struct zone *zone, struct compact_control *cc)
+{ /* Compact @zone under the settings in @cc and return a COMPACT_* status. */
+ int ret;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ const bool sync = cc->mode != MIGRATE_ASYNC; /* index into compact_cached_migrate_pfn[]: 0 for MIGRATE_ASYNC, 1 otherwise */
+
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /*
+ * Nothing to do: either there are too few free pages to compact
+ * (SKIPPED) or the allocation would succeed without compaction
+ * (PARTIAL).
+ */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ /*
+ * Setup to move all movable pages to the end of the zone. Used cached
+ * information on where the scanners should start but check that it
+ * is initialised by ensuring the values are within zone boundaries.
+ * Out-of-range values are replaced in both cc and the zone's cache.
+ */
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ }
+
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
+
+ migrate_prep_local();
+
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ int err;
+
+ switch (isolate_migratepages(zone, cc)) {
+ case ISOLATE_ABORT:
+ ret = COMPACT_PARTIAL;
+ putback_movable_pages(&cc->migratepages); /* hand back whatever is still on the migrate list */
+ cc->nr_migratepages = 0;
+ goto out;
+ case ISOLATE_NONE:
+ continue; /* restarts the while loop, re-evaluating compact_finished() */
+ case ISOLATE_SUCCESS:
+ ;
+ }
+
+ if (!cc->nr_migratepages) /* block scanned but nothing was isolated */
+ continue;
+
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
+ compaction_free, (unsigned long)cc, cc->mode,
+ MR_COMPACTION);
+
+ trace_mm_compaction_migratepages(cc->nr_migratepages, err,
+ &cc->migratepages);
+
+ /* All pages were either migrated or will be released */
+ cc->nr_migratepages = 0;
+ if (err) {
+ putback_movable_pages(&cc->migratepages);
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ * (so only bail out here while they have not met yet).
+ */
+ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
+ }
+ }
+
+out:
+ /*
+ * Release free pages and check accounting: release_freepages() returns
+ * a count that is subtracted from cc->nr_freepages, which must then
+ * be zero.
+ */
+ cc->nr_freepages -= release_freepages(&cc->freepages);
+ VM_BUG_ON(cc->nr_freepages != 0);
+
+ trace_mm_compaction_end(ret);
+
+ return ret;
+}
+
+/*
+ * Compact a single @zone for an allocation of the given @order.
+ * Builds an on-stack compact_control from the arguments, runs
+ * compact_zone() on it and reports cc.contended through @contended.
+ * Returns the status of compact_zone().
+ */
+static unsigned long compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+{
+	unsigned long status;
+	struct compact_control control = {
+		.zone = zone,
+		.order = order,
+		.mode = mode,
+		.migratetype = allocflags_to_migratetype(gfp_mask),
+		.nr_migratepages = 0,
+		.nr_freepages = 0,
+	};
+
+	INIT_LIST_HEAD(&control.freepages);
+	INIT_LIST_HEAD(&control.migratepages);
+
+	status = compact_zone(zone, &control);
+
+	/* compact_zone() must not leave pages behind on either list */
+	VM_BUG_ON(!list_empty(&control.freepages));
+	VM_BUG_ON(!list_empty(&control.migratepages));
+
+	*contended = control.contended;
+
+	return status;
+}
+
+int sysctl_extfrag_threshold = 500; /* compaction_suitable() returns COMPACT_SKIPPED for a fragmentation index in [0, this value] */
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ * @mode: The migration mode for async, sync light, or sync migration
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ *
+ * This is the main entry point for direct page compaction.
+ *
+ * Nothing is compacted and COMPACT_SKIPPED is returned when @order is 0 or
+ * when @gfp_mask lacks __GFP_FS or __GFP_IO. Otherwise the zones of
+ * @zonelist are compacted in turn until one of them passes the low
+ * watermark check for @order, and the maximum of COMPACT_SKIPPED and the
+ * per-zone statuses is returned.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ enum migrate_mode mode, bool *contended)
+{
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ int may_enter_fs = gfp_mask & __GFP_FS;
+ int may_perform_io = gfp_mask & __GFP_IO;
+ struct zoneref *z;
+ struct zone *zone;
+ int rc = COMPACT_SKIPPED;
+ int alloc_flags = 0; /* only used for the watermark check in the loop below */
+
+ /* Check if the GFP flags allow compaction */
+ if (!order || !may_enter_fs || !may_perform_io)
+ return rc;
+
+ count_compact_event(COMPACTSTALL);
+
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
+ /* Compact each zone in the list */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ nodemask) {
+ int status;
+
+ status = compact_zone_order(zone, order, gfp_mask, mode,
+ contended); /* *contended is overwritten for every zone visited */
+ rc = max(status, rc);
+
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+ alloc_flags))
+ break;
+ }
+
+ return rc;
+}
+
+
+/*
+ * Compact every populated zone of @pgdat, reusing @cc for each of them.
+ * A zone is compacted when cc->order is -1 or when compaction is not
+ * currently deferred for it.
+ */
+static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
+{
+	int idx;
+
+	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+		struct zone *zone = &pgdat->node_zones[idx];
+
+		if (!populated_zone(zone))
+			continue;
+
+		/* Reset the per-zone counters and lists before each run */
+		cc->nr_freepages = 0;
+		cc->nr_migratepages = 0;
+		cc->zone = zone;
+		INIT_LIST_HEAD(&cc->freepages);
+		INIT_LIST_HEAD(&cc->migratepages);
+
+		if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+			compact_zone(zone, cc);
+
+		/* A zone that now meets the watermark is no longer deferred */
+		if (cc->order > 0 &&
+		    zone_watermark_ok(zone, cc->order,
+				      low_wmark_pages(zone), 0, 0))
+			compaction_defer_reset(zone, cc->order, false);
+
+		VM_BUG_ON(!list_empty(&cc->freepages));
+		VM_BUG_ON(!list_empty(&cc->migratepages));
+	}
+}
+
+/*
+ * Asynchronously compact all zones of @pgdat for allocations of @order.
+ * An @order of 0 is a no-op.
+ */
+void compact_pgdat(pg_data_t *pgdat, int order)
+{
+	struct compact_control cc = {
+		.mode = MIGRATE_ASYNC,
+		.order = order,
+	};
+
+	if (order)
+		__compact_pgdat(pgdat, &cc);
+}
+
+/*
+ * Synchronously compact every zone of node @nid, ignoring the pageblock
+ * skip hints; order -1 selects whole-zone compaction.
+ */
+static void compact_node(int nid)
+{
+	struct compact_control cc = {
+		.ignore_skip_hint = true,
+		.mode = MIGRATE_SYNC,
+		.order = -1,
+	};
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	__compact_pgdat(pgdat, &cc);
+}
+
+/* Compact every online node in the system, one after the other */
+static void compact_nodes(void)
+{
+	int node;
+
+	/* Pending LRU list updates are flushed first */
+	lru_add_drain_all();
+
+	for_each_online_node(node) {
+		compact_node(node);
+	}
+}
+
+/* The value written is never looked at: any write compacts all memory */
+int sysctl_compact_memory;
+
+/*
+ * Entry point for compacting all nodes via /proc/sys/vm.
+ * A write triggers compact_nodes(); reads do nothing. Always returns 0.
+ */
+int sysctl_compaction_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	if (!write)
+		return 0;
+
+	compact_nodes();
+
+	return 0;
+}
+
+/*
+ * sysctl handler that forwards the request to proc_dointvec_minmax().
+ *
+ * Returns the result of proc_dointvec_minmax() so that a rejected write
+ * (for example a malformed or out-of-range value) is reported to the
+ * caller instead of being turned into success.
+ */
+int sysctl_extfrag_handler(struct ctl_table *table, int write,
+			void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return proc_dointvec_minmax(table, write, buffer, length, ppos);
+}
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+/*
+ * Store handler of the per-node "compact" attribute: compacts the node
+ * identified by dev->id if that id is a valid, online node. The written
+ * data is ignored and @count is always returned.
+ */
+static ssize_t sysfs_compact_node(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t count)
+{
+	const int node = dev->id;
+
+	if (node < 0 || node >= nr_node_ids || !node_online(node))
+		return count;
+
+	/* Flush pending updates to the LRU lists */
+	lru_add_drain_all();
+	compact_node(node);
+
+	return count;
+}
+static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+
+/* Create the "compact" attribute file on @node's device; returns the result of device_create_file() */
+int compaction_register_node(struct node *node)
+{
+	int err = device_create_file(&node->dev, &dev_attr_compact);
+
+	return err;
+}
+
+/* Remove the "compact" attribute file from @node's device */
+void compaction_unregister_node(struct node *node)
+{
+	/* No "return <expr>;" here: the function returns void */
+	device_remove_file(&node->dev, &dev_attr_compact);
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
+#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 00000000000..789ff70c8a4
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,102 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page-debug-flags.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+
+/* Set PAGE_DEBUG_FLAG_POISON in @page's debug flags */
+static inline void set_page_poison(struct page *page)
+{
+	__set_bit(PAGE_DEBUG_FLAG_POISON,
+		  &page->debug_flags);
+}
+
+/* Clear PAGE_DEBUG_FLAG_POISON in @page's debug flags */
+static inline void clear_page_poison(struct page *page)
+{
+	__clear_bit(PAGE_DEBUG_FLAG_POISON,
+		    &page->debug_flags);
+}
+
+/* Report whether PAGE_DEBUG_FLAG_POISON is set in @page's debug flags */
+static inline bool page_poison(struct page *page)
+{
+	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags) != 0;
+}
+
+/* Flag @page as poisoned and fill its PAGE_SIZE bytes with PAGE_POISON */
+static void poison_page(struct page *page)
+{
+	void *kaddr;
+
+	kaddr = kmap_atomic(page);
+	set_page_poison(page);
+	memset(kaddr, PAGE_POISON, PAGE_SIZE);
+	kunmap_atomic(kaddr);
+}
+
+/* Poison the @n consecutive pages starting at @page */
+static void poison_pages(struct page *page, int n)
+{
+	int idx = 0;
+
+	while (idx < n) {
+		poison_page(&page[idx]);
+		idx++;
+	}
+}
+
+/* Return true when @a and @b differ in exactly one bit position */
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+	unsigned char diff = a ^ b;
+
+	if (!diff)
+		return false;
+
+	/* Clearing the lowest set bit leaves zero only if it was the sole one */
+	return (diff & (diff - 1)) == 0;
+}
+
+/*
+ * Verify that the @bytes bytes at @mem all still hold PAGE_POISON.
+ * When they do not, the span from the first to the last mismatching byte
+ * is hex-dumped together with a stack trace, subject to rate limiting.
+ */
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+	unsigned char *first;
+	unsigned char *last;
+
+	first = memchr_inv(mem, PAGE_POISON, bytes);
+	if (first == NULL)
+		return;
+
+	/* Walk back from the end to the last byte that is not poison */
+	last = mem + bytes - 1;
+	while (last > first && *last == PAGE_POISON)
+		last--;
+
+	if (!__ratelimit(&ratelimit))
+		return;
+
+	if (first == last && single_bit_flip(*first, PAGE_POISON))
+		printk(KERN_ERR "pagealloc: single bit error\n");
+	else
+		printk(KERN_ERR "pagealloc: memory corruption\n");
+
+	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, first,
+			last - first + 1, 1);
+	dump_stack();
+}
+
+/*
+ * If @page carries the poison flag, check its contents for corruption and
+ * clear the flag. Pages without the flag are left alone.
+ */
+static void unpoison_page(struct page *page)
+{
+	void *kaddr;
+
+	if (page_poison(page)) {
+		kaddr = kmap_atomic(page);
+		check_poison_mem(kaddr, PAGE_SIZE);
+		clear_page_poison(page);
+		kunmap_atomic(kaddr);
+	}
+}
+
+/* Unpoison the @n consecutive pages starting at @page */
+static void unpoison_pages(struct page *page, int n)
+{
+	int idx = 0;
+
+	while (idx < n) {
+		unpoison_page(&page[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Poison-based implementation of kernel_map_pages(): a zero @enable
+ * poisons the @numpages pages starting at @page, a non-zero @enable
+ * checks and unpoisons them.
+ */
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (!enable) {
+		poison_pages(page, numpages);
+		return;
+	}
+
+	unpoison_pages(page, numpages);
+}
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 00000000000..306baa594f9
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,505 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device. It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple. The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages. Each page in the page_list is split into blocks of at
+ * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page. Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
+struct dma_pool { /* the pool */
+ struct list_head page_list; /* struct dma_page entries, linked through their page_list member */
+ spinlock_t lock; /* taken around page_list walks and per-page state changes */
+ size_t size; /* block size, after the adjustments made in dma_pool_create() */
+ struct device *dev; /* device passed to dma_alloc_coherent()/dma_free_coherent() */
+ size_t allocation; /* bytes per dma_alloc_coherent() call: max(size, PAGE_SIZE) */
+ size_t boundary; /* blocks are laid out so they do not cross multiples of this */
+ char name[32]; /* copy of the name given at creation, used in diagnostics */
+ struct list_head pools; /* link in dev->dma_pools */
+};
+
+struct dma_page { /* cacheable header for 'allocation' bytes */
+ struct list_head page_list; /* link in the owning pool's page_list */
+ void *vaddr; /* kernel virtual address returned by dma_alloc_coherent() */
+ dma_addr_t dma; /* DMA address of the same allocation */
+ unsigned int in_use; /* number of blocks currently handed out from this page */
+ unsigned int offset; /* offset of the first free block; >= pool->allocation when none is free */
+};
+
+static DEFINE_MUTEX(pools_lock); /* held while walking or changing a device's dma_pools list */
+
+/*
+ * Show handler of the "pools" device attribute. Writes a header line and
+ * then one line per pool of @dev: name, blocks in use, total blocks, block
+ * size and page count. Returns the number of bytes placed in @buf.
+ */
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	unsigned written;
+	unsigned remaining = PAGE_SIZE;
+	char *cursor = buf;
+	struct dma_page *page;
+	struct dma_pool *pool;
+
+	written = scnprintf(cursor, remaining, "poolinfo - 0.1\n");
+	remaining -= written;
+	cursor += written;
+
+	mutex_lock(&pools_lock);
+	list_for_each_entry(pool, &dev->dma_pools, pools) {
+		unsigned nr_pages = 0;
+		unsigned nr_blocks = 0;
+
+		/* Count pages and in-use blocks under the pool's own lock */
+		spin_lock_irq(&pool->lock);
+		list_for_each_entry(page, &pool->page_list, page_list) {
+			nr_blocks += page->in_use;
+			nr_pages++;
+		}
+		spin_unlock_irq(&pool->lock);
+
+		/* per-pool info, no real statistics yet */
+		written = scnprintf(cursor, remaining,
+				    "%-16s %4u %4Zu %4Zu %2u\n",
+				    pool->name, nr_blocks,
+				    nr_pages * (pool->allocation / pool->size),
+				    pool->size, nr_pages);
+		remaining -= written;
+		cursor += written;
+	}
+	mutex_unlock(&pools_lock);
+
+	return PAGE_SIZE - remaining;
+}
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created.  Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory.  Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives.  The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary.  This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ *
+ * NULL is returned for a zero @size, for an @align or @boundary that is
+ * not a power of two, for a @boundary smaller than the (aligned) block
+ * size, on allocation failure, and when the "pools" attribute file cannot
+ * be created for the first pool of @dev.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+				 size_t size, size_t align, size_t boundary)
+{
+	struct dma_pool *retval;
+	size_t allocation;
+
+	if (align == 0) {
+		align = 1;
+	} else if (align & (align - 1)) {
+		return NULL;
+	}
+
+	/* Every free block stores the offset of the next one: need >= 4 bytes */
+	if (size == 0) {
+		return NULL;
+	} else if (size < 4) {
+		size = 4;
+	}
+
+	if ((size % align) != 0)
+		size = ALIGN(size, align);
+
+	allocation = max_t(size_t, size, PAGE_SIZE);
+
+	if (!boundary) {
+		boundary = allocation;
+	} else if ((boundary < size) || (boundary & (boundary - 1))) {
+		return NULL;
+	}
+
+	retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+	if (!retval)
+		return retval;
+
+	strlcpy(retval->name, name, sizeof(retval->name));
+
+	retval->dev = dev;
+
+	INIT_LIST_HEAD(&retval->page_list);
+	spin_lock_init(&retval->lock);
+	retval->size = size;
+	retval->boundary = boundary;
+	retval->allocation = allocation;
+
+	INIT_LIST_HEAD(&retval->pools);
+
+	mutex_lock(&pools_lock);
+	if (list_empty(&dev->dma_pools) &&
+	    device_create_file(dev, &dev_attr_pools)) {
+		/*
+		 * The attribute file could not be created: drop the pool, but
+		 * fall through to the unlock below instead of returning with
+		 * pools_lock still held.
+		 */
+		kfree(retval);
+		retval = NULL;
+	} else
+		list_add(&retval->pools, &dev->dma_pools);
+	mutex_unlock(&pools_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+/*
+ * Thread the free-block chain through a freshly allocated @page: each block
+ * starts with the offset of the next free block. A block that would reach
+ * the next boundary is placed at that boundary instead. The chain ends with
+ * an offset that is >= pool->allocation.
+ */
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+	unsigned int cur = 0;
+	unsigned int limit = pool->boundary;
+
+	for (;;) {
+		unsigned int following = cur + pool->size;
+
+		if (unlikely((following + pool->size) >= limit)) {
+			following = limit;
+			limit += pool->boundary;
+		}
+		*(int *)(page->vaddr + cur) = following;
+		cur = following;
+
+		if (cur >= pool->allocation)
+			break;
+	}
+}
+
+/*
+ * Allocate a page header plus pool->allocation bytes of coherent memory
+ * and set up its free-block chain. The page is not linked into the pool.
+ * Returns NULL if either allocation fails.
+ */
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+	struct dma_page *page;
+
+	page = kmalloc(sizeof(*page), mem_flags);
+	if (!page)
+		return NULL;
+
+	page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+					 &page->dma, mem_flags);
+	if (!page->vaddr) {
+		kfree(page);
+		return NULL;
+	}
+
+#ifdef	DMAPOOL_DEBUG
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	pool_initialise_page(pool, page);
+	page->in_use = 0;
+	page->offset = 0;
+
+	return page;
+}
+
+/* Return 1 while at least one block of @page is handed out, else 0 */
+static inline int is_page_busy(struct dma_page *page)
+{
+	return page->in_use ? 1 : 0;
+}
+
+/*
+ * Give the coherent memory of @page back, unlink the page from its list
+ * and free the page header.
+ */
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+#ifdef	DMAPOOL_DEBUG
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	dma_free_coherent(pool->dev, pool->allocation, page->vaddr,
+			  page->dma);
+	list_del(&page->page_list);
+	kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed; a NULL pool is a no-op
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ *
+ * Pages that still have blocks in use are reported and their coherent
+ * memory is deliberately leaked; only the page header is freed.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+	/* Tolerate NULL, as free-style functions conventionally do */
+	if (unlikely(!pool))
+		return;
+
+	mutex_lock(&pools_lock);
+	list_del(&pool->pools);
+	/* The last pool of the device takes the "pools" attribute with it */
+	if (pool->dev && list_empty(&pool->dev->dma_pools))
+		device_remove_file(pool->dev, &dev_attr_pools);
+	mutex_unlock(&pools_lock);
+
+	while (!list_empty(&pool->page_list)) {
+		struct dma_page *page;
+		page = list_entry(pool->page_list.next,
+				  struct dma_page, page_list);
+		if (is_page_busy(page)) {
+			if (pool->dev)
+				dev_err(pool->dev,
+					"dma_pool_destroy %s, %p busy\n",
+					pool->name, page->vaddr);
+			else
+				printk(KERN_ERR
+				       "dma_pool_destroy %s, %p busy\n",
+				       pool->name, page->vaddr);
+			/* leak the still-in-use consistent memory */
+			list_del(&page->page_list);
+			kfree(page);
+		} else
+			pool_free_page(pool, page);
+	}
+
+	kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ *
+ * The first page on the pool's list that still has a free block is used;
+ * when there is none, a new page is allocated with @mem_flags while the
+ * pool lock is dropped, then added to the list.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ unsigned long flags;
+ struct dma_page *page;
+ size_t offset;
+ void *retval;
+
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry(page, &pool->page_list, page_list) {
+ if (page->offset < pool->allocation) /* this page still has a free block */
+ goto ready;
+ }
+
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
+
+ spin_lock_irqsave(&pool->lock, flags);
+
+ list_add(&page->page_list, &pool->page_list);
+ ready:
+ page->in_use++;
+ offset = page->offset;
+ page->offset = *(int *)(page->vaddr + offset); /* pop: the free block itself stores the next free offset */
+ retval = offset + page->vaddr;
+ *handle = offset + page->dma;
+#ifdef DMAPOOL_DEBUG
+ {
+ int i;
+ u8 *data = retval;
+ /* page->offset is stored in first 4 bytes */
+ for (i = sizeof(page->offset); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_alloc %s, %p (corrupted)\n",
+ pool->name, retval);
+ else
+ pr_err("dma_pool_alloc %s, %p (corrupted)\n",
+ pool->name, retval);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break; /* report only the first corrupted byte found */
+ }
+ }
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+/*
+ * Find the page of @pool whose DMA range [page->dma, page->dma +
+ * pool->allocation) contains @dma. Returns NULL if no page matches.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+	struct dma_page *page;
+
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (dma >= page->dma && dma < (page->dma + pool->allocation))
+			return page;
+	}
+
+	return NULL;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ *
+ * A @dma address that belongs to no page of @pool is reported and the
+ * call returns without freeing anything. Only with DMAPOOL_DEBUG is
+ * @vaddr cross-checked against @dma and a double free detected.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+ struct dma_page *page;
+ unsigned long flags;
+ unsigned int offset;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ page = pool_find_page(pool, dma);
+ if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pool->name, vaddr, (unsigned long)dma);
+ return;
+ }
+
+ offset = vaddr - page->vaddr; /* position of the block inside its page */
+#ifdef DMAPOOL_DEBUG
+ if ((dma - page->dma) != offset) { /* @vaddr and @dma name different blocks */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ else
+ printk(KERN_ERR
+ "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pool->name, vaddr, (unsigned long long)dma);
+ return;
+ }
+ {
+ unsigned int chain = page->offset; /* walk the page's free chain looking for this block */
+ while (chain < pool->allocation) {
+ if (chain != offset) {
+ chain = *(int *)(page->vaddr + chain);
+ continue;
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ if (pool->dev)
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ else
+ printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+ "already free\n", pool->name,
+ (unsigned long long)dma);
+ return;
+ }
+ }
+ memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+ page->in_use--;
+ *(int *)vaddr = page->offset; /* push: the block now stores the previous head of the free chain */
+ page->offset = offset;
+ /*
+ * Resist a temptation to do
+ * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * Better have a few empty pages hang around.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+
+/* devres release callback: @res holds a pointer to the pool to destroy */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+	struct dma_pool **slot = res;
+
+	dma_pool_destroy(*slot);
+}
+
+/* devres match callback: non-zero when the pool stored in @res is @match_data */
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+	struct dma_pool **slot = res;
+
+	return *slot == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero);
+ *	passed to dma_pool_create() as its boundary argument
+ *
+ * Managed dma_pool_create().  DMA pool created with this function is
+ * automatically destroyed on driver detach.  Returns NULL when either the
+ * devres entry or the pool itself cannot be allocated.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+				  size_t size, size_t align, size_t allocation)
+{
+	struct dma_pool **slot;
+	struct dma_pool *pool;
+
+	slot = devres_alloc(dmam_pool_release, sizeof(*slot), GFP_KERNEL);
+	if (!slot)
+		return NULL;
+
+	pool = dma_pool_create(name, dev, size, align, allocation);
+	*slot = pool;
+	if (!pool) {
+		/* No pool to manage: give the devres entry back */
+		devres_free(slot);
+		return NULL;
+	}
+
+	devres_add(dev, slot);
+
+	return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().  Releases the devres entry that matches
+ * @pool on the pool's device and warns if devres_release() reports a
+ * non-zero result.
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+	WARN_ON(devres_release(pool->dev, dmam_pool_release,
+			       dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 00000000000..e10ccd299d6
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{ /* "early_ioremap_debug" boot parameter handler: enable debug output; @str is unused */
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+void __init __weak early_ioremap_shutdown(void)
+{ /* Weak no-op default; called by early_ioremap_reset() before it sets after_paging_init */
+}
+
+void __init early_ioremap_reset(void)
+{ /*
+ * Run the shutdown hook, then set after_paging_init so that later fixmap
+ * updates in this file go through the __late_*_fixmap helpers.
+ */
+ early_ioremap_shutdown();
+ after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t prot)
+{
+ BUG(); /* fallback when no __late_set_fixmap is defined: must never be reached */
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+ BUG(); /* fallback when no __late_clear_fixmap is defined: must never be reached */
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{ /*
+ * Warn (stopping at the first hit) if any slot is already marked in use,
+ * then precompute each slot's virtual address: slot i starts at fixmap
+ * index FIX_BTMAP_BEGIN - NR_FIX_BTMAPS * i.
+ */
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (WARN_ON(prev_map[i]))
+ break;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{ /*
+ * late_initcall: count the slots that still hold a mapping, i.e. were
+ * set by __early_ioremap() and never cleared by early_iounmap().
+ * Warns and returns 1 if any remain, otherwise returns 0.
+ */
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (WARN(count, KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n"
+ "please boot with early_ioremap_debug and report the dmesg.\n",
+ count))
+ return 1;
+ return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+/*
+ * Map @size bytes of physical address space starting at @phys_addr into the
+ * first free boot-time fixmap slot, using protection @prot.
+ *
+ * Returns the virtual address that corresponds to @phys_addr (the slot base
+ * plus the offset of @phys_addr within its page), or NULL when no slot is
+ * free, @size is zero, the range wraps around, or the page-aligned range
+ * needs more than NR_FIX_BTMAPS pages.
+ */
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+ resource_size_t req_addr = phys_addr; /* caller's address, kept for debug */
+ unsigned long offset;
+ resource_size_t last_addr;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ /* Pick the first slot that has no live mapping */
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+ __func__, (u64)phys_addr, size))
+ return NULL;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (WARN_ON(!size || last_addr < phys_addr))
+ return NULL;
+
+ /* Remember the caller's size; early_iounmap() checks against it */
+ prev_size[slot] = size;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+ return NULL;
+
+ /*
+ * Ok, go for it..
+ */
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_set_fixmap(idx, phys_addr, prot);
+ else
+ __early_set_fixmap(idx, phys_addr, prot);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ /*
+ * Report the address and size the caller asked for. By now phys_addr
+ * has been advanced past the end of the range and size has been
+ * rounded to whole pages, so neither matches the request any more nor
+ * the size that early_iounmap() prints for the same mapping.
+ */
+ WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, (u64)req_addr, prev_size[slot], slot, offset,
+ slot_virt[slot]);
+
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+ return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{ /*
+ * Release a mapping recorded in prev_map[]. @addr must equal the pointer
+ * stored for a slot and @size the size recorded for it in prev_size[];
+ * otherwise a warning is printed and nothing is unmapped.
+ */
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size))
+ return;
+
+ if (WARN(prev_size[slot] != size,
+ "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]))
+ return;
+
+ WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+ addr, size, slot);
+
+ virt_addr = (unsigned long)addr;
+ if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+ return;
+
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; /* pages covered, counting the leading in-page offset */
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+ --idx;
+ --nrpages;
+ }
+ prev_map[slot] = NULL; /* slot can be picked again by __early_ioremap() */
+}
+
+/*
+ * Remap an IO device: map @size bytes at @phys_addr through
+ * __early_ioremap() with FIXMAP_PAGE_IO protection.
+ */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/*
+ * Remap memory: map @size bytes at @phys_addr through __early_ioremap()
+ * with FIXMAP_PAGE_NORMAL protection and return it as a plain pointer
+ * (the __iomem annotation is cast away).
+ */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{ /* !CONFIG_MMU: the physical address is returned as-is; @size is unused */
+ return (__force void __iomem *)phys_addr;
+}
+
+/*
+ * Remap memory (!CONFIG_MMU): the physical address is returned unchanged
+ * as a pointer; @size is unused.
+ */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{ /* !CONFIG_MMU: early_ioremap() set nothing up, so there is nothing to undo */
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{ /* Unmap a plain-pointer mapping: forwards to early_iounmap() with an __iomem cast */
+ early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 168c78a121b..3bcfd81db45 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2002, Linus Torvalds
*
- * 11Jan2003 akpm@digeo.com
+ * 11Jan2003 Andrew Morton
* Initial version.
*/
@@ -17,6 +17,7 @@
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
+#include <linux/swap.h>
#include <asm/unistd.h>
@@ -24,9 +25,9 @@
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
-asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
- struct file *file = fget(fd);
+ struct fd f = fdget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -35,23 +36,35 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
unsigned long nrpages;
int ret = 0;
- if (!file)
+ if (!f.file)
return -EBADF;
- if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+ if (S_ISFIFO(file_inode(f.file)->i_mode)) {
ret = -ESPIPE;
goto out;
}
- mapping = file->f_mapping;
+ mapping = f.file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
goto out;
}
- if (mapping->a_ops->get_xip_page)
- /* no bad return value, but ignore advice */
+ if (mapping->a_ops->get_xip_mem) {
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_NOREUSE:
+ case POSIX_FADV_DONTNEED:
+ /* no bad return value, but ignore advice */
+ break;
+ default:
+ ret = -EINVAL;
+ }
goto out;
+ }
/* Careful about overflows. Len == 0 means "as much as possible" */
endbyte = offset + len;
@@ -64,20 +77,23 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
switch (advice) {
case POSIX_FADV_NORMAL:
- file->f_ra.ra_pages = bdi->ra_pages;
+ f.file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_RANDOM:
- file->f_ra.ra_pages = 0;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
- file->f_ra.ra_pages = bdi->ra_pages * 2;
+ f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_WILLNEED:
- if (!mapping->a_ops->readpage) {
- ret = -EINVAL;
- break;
- }
-
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -86,38 +102,53 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
-
- ret = force_page_cache_readahead(mapping, file,
- start_index,
- max_sane_readahead(nrpages));
- if (ret > 0)
- ret = 0;
+
+ /*
+ * Ignore return value because fadvise() shall return
+ * success even if filesystem can't retrieve a hint.
+ */
+ force_page_cache_readahead(mapping, f.file, start_index,
+ nrpages);
break;
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
if (!bdi_write_congested(mapping->backing_dev_info))
- filemap_flush(mapping);
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
/* First and last FULL page! */
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index >= start_index)
- invalidate_mapping_pages(mapping, start_index,
+ if (end_index >= start_index) {
+ unsigned long count = invalidate_mapping_pages(mapping,
+ start_index, end_index);
+
+ /*
+ * If fewer pages were invalidated than expected then
+ * it is possible that some of the pages were on
+ * a per-cpu pagevec for a remote CPU. Drain all
+ * pagevecs and try again.
+ */
+ if (count < (end_index - start_index + 1)) {
+ lru_add_drain_all();
+ invalidate_mapping_pages(mapping, start_index,
end_index);
+ }
+ }
break;
default:
ret = -EINVAL;
}
out:
- fput(file);
+ fdput(f);
return ret;
}
#ifdef __ARCH_WANT_SYS_FADVISE64
-asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
{
return sys_fadvise64_64(fd, offset, len, advice);
}
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 00000000000..fefaabaab76
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,60 @@
+#include <linux/fault-inject.h>
+#include <linux/slab.h>
+
+static struct {
+ struct fault_attr attr;
+ u32 ignore_gfp_wait; /* when set, should_failslab() never fails __GFP_WAIT allocations */
+ int cache_filter; /* when set, should_failslab() only fails caches flagged SLAB_FAILSLAB */
+} failslab = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .cache_filter = 0,
+};
+
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+{ /*
+ * Decide whether a slab allocation of @size bytes should be failed.
+ * Returns false for __GFP_NOFAIL requests and for requests excluded by
+ * the ignore_gfp_wait / cache_filter settings; otherwise the decision is
+ * left to should_fail() on failslab.attr.
+ */
+ if (gfpflags & __GFP_NOFAIL)
+ return false;
+
+ if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+ return false;
+
+ if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ return false;
+
+ return should_fail(&failslab.attr, size);
+}
+
+static int __init setup_failslab(char *str)
+{ /* "failslab=" boot option: hand the option string to setup_fault_attr() */
+ return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+static int __init failslab_debugfs_init(void)
+{ /*
+ * Create the "failslab" fault-injection debugfs directory plus the
+ * "ignore-gfp-wait" and "cache-filter" boolean files (owner read/write).
+ * Returns 0, the PTR_ERR() of the directory, or -ENOMEM if a file could
+ * not be created (the directory is then removed again).
+ * NOTE(review): cache_filter is declared int while ignore_gfp_wait is u32,
+ * yet both addresses go to debugfs_create_bool() - confirm the expected type.
+ */
+ struct dentry *dir;
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter))
+ goto fail;
+
+ return 0;
+fail:
+ debugfs_remove_recursive(dir);
+
+ return -ENOMEM;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/filemap.c b/mm/filemap.c
index afcdc72b5e9..900edfaf6df 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,14 +9,14 @@
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
*/
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/mman.h>
@@ -25,25 +25,27 @@
#include <linux/uio.h>
#include <linux/hash.h>
#include <linux/writeback.h>
+#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
-#include <linux/syscalls.h>
#include <linux/cpuset.h>
-#include "filemap.h"
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
+#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/filemap.h>
+
/*
* FIXME: remove all knowledge of the buffer layer from the core VM
*/
-#include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/buffer_head.h> /* for try_to_free_buffers */
#include <asm/mman.h>
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs);
-
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
* though.
@@ -59,33 +61,30 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
/*
* Lock ordering:
*
- * ->i_mmap_lock (vmtruncate)
+ * ->i_mmap_mutex (truncate_pagecache)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
* ->i_mutex
- * ->i_mmap_lock (truncate->unmap_mapping_range)
+ * ->i_mmap_mutex (truncate->unmap_mapping_range)
*
* ->mmap_sem
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
* ->lock_page (access_process_vm)
*
- * ->mmap_sem
- * ->i_mutex (msync)
- *
- * ->i_mutex
- * ->i_alloc_sem (various)
+ * ->i_mutex (generic_perform_write)
+ * ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * bdi->wb.list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
- * ->i_mmap_lock
+ * ->i_mmap_mutex
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
@@ -99,76 +98,174 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
+ * ->i_mmap_mutex
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
*/
+static void page_cache_tree_delete(struct address_space *mapping,
+ struct page *page, void *shadow)
+{ /*
+ * Take @page out of @mapping's radix tree, storing @shadow (which may be
+ * NULL) in its slot, and update mapping->nrpages, mapping->nrshadows and
+ * the per-node page/shadow counts. @page must be locked.
+ */
+ struct radix_tree_node *node;
+ unsigned long index;
+ unsigned int offset;
+ unsigned int tag;
+ void **slot;
+
+ VM_BUG_ON(!PageLocked(page));
+
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+ if (shadow) {
+ mapping->nrshadows++;
+ /*
+ * Make sure the nrshadows update is committed before
+ * the nrpages update so that final truncate racing
+ * with reclaim does not see both counters 0 at the
+ * same time and miss a shadow entry.
+ */
+ smp_wmb();
+ }
+ mapping->nrpages--;
+
+ if (!node) {
+ /* Clear direct pointer tags in root node */
+ mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+ radix_tree_replace_slot(slot, shadow);
+ return;
+ }
+
+ /* Clear tree tags for the removed page */
+ index = page->index;
+ offset = index & RADIX_TREE_MAP_MASK;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (test_bit(offset, node->tags[tag]))
+ radix_tree_tag_clear(&mapping->page_tree, index, tag);
+ }
+
+ /* Delete page, swap shadow entry */
+ radix_tree_replace_slot(slot, shadow);
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ return; /* presumably the node itself was freed - nothing left to track; confirm */
+
+ /*
+ * Track node that only contains shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already tracked. The
+ * list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_pages(node) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes, &node->private_list);
+ }
+}
+
/*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
- radix_tree_delete(&mapping->page_tree, page->index);
+ trace_mm_filemap_delete_from_page_cache(page);
+ /*
+ * if we're uptodate, flush out into the cleancache, otherwise
+ * invalidate any existing cleancache entries. We can't leave
+ * stale data around in the cleancache once our page is gone
+ */
+ if (PageUptodate(page) && PageMappedToDisk(page))
+ cleancache_put_page(page);
+ else
+ cleancache_invalidate_page(mapping, page);
+
+ page_cache_tree_delete(mapping, page, shadow);
+
page->mapping = NULL;
- mapping->nrpages--;
+ /* Leave page->index set: truncation lookup relies upon it */
+
__dec_zone_page_state(page, NR_FILE_PAGES);
+ if (PageSwapBacked(page))
+ __dec_zone_page_state(page, NR_SHMEM);
+ BUG_ON(page_mapped(page));
+
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ *
+ * Fix it up by doing a final dirty accounting check after
+ * having removed the page entirely.
+ */
+ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ }
}
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
- write_lock_irq(&mapping->tree_lock);
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
+ freepage = mapping->a_ops->freepage;
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, NULL);
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
+ page_cache_release(page);
}
+EXPORT_SYMBOL(delete_from_page_cache);
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
{
- struct address_space *mapping;
- struct page *page;
-
- page = container_of((unsigned long *)word, struct page, flags);
-
- /*
- * page_mapping() is being called without PG_locked held.
- * Some knowledge of the state and use of the page is used to
- * reduce the requirements down to a memory barrier.
- * The danger here is of a stale page_mapping() return value
- * indicating a struct address_space different from the one it's
- * associated with when it is associated with one.
- * After smp_mb(), it's either the correct page_mapping() for
- * the page, or an old page_mapping() and the page's own
- * page_mapping() has gone NULL.
- * The ->sync_page() address_space operation must tolerate
- * page_mapping() going NULL. By an amazing coincidence,
- * this comes about because none of the users of the page
- * in the ->sync_page() methods make essential use of the
- * page_mapping(), merely passing the page down to the backing
- * device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page_private(page) is
- * of interest. When page_mapping() does go NULL, the entire
- * call stack gracefully ignores the page and returns.
- * -- wli
- */
- smp_mb();
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(page);
io_schedule();
return 0;
}
+static int sleep_on_page_killable(void *word)
+{ /* Like sleep_on_page(), but returns -EINTR if a fatal signal is pending afterwards */
+ sleep_on_page(word);
+ return fatal_signal_pending(current) ? -EINTR : 0;
+}
+
+static int filemap_check_errors(struct address_space *mapping)
+{ /*
+ * Report and clear the AS_ENOSPC / AS_EIO flags on @mapping. Returns
+ * -EIO if AS_EIO was set, else -ENOSPC if AS_ENOSPC was set, else 0.
+ * Both flags are cleared even when only -EIO is reported. Each flag is
+ * tested with test_bit() first, so test_and_clear_bit() only runs when
+ * the bit is set.
+ */
+ int ret = 0;
+ /* Check for outstanding write errors */
+ if (test_bit(AS_ENOSPC, &mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &mapping->flags))
+ ret = -EIO;
+ return ret;
+}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -190,7 +287,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
+ .nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
@@ -214,11 +311,12 @@ int filemap_fdatawrite(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
/**
* filemap_flush - mostly a non-blocking flush
@@ -234,27 +332,27 @@ int filemap_flush(struct address_space *mapping)
EXPORT_SYMBOL(filemap_flush);
/**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping: target address_space
- * @start: beginning page index
- * @end: ending page index
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
*
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.
*/
-int wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+ loff_t end_byte)
{
+ pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret = 0;
- pgoff_t index;
+ int ret2, ret = 0;
- if (end < start)
- return 0;
+ if (end_byte < start_byte)
+ goto out;
pagevec_init(&pvec, 0);
- index = start;
while ((index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_WRITEBACK,
@@ -269,85 +367,20 @@ int wait_on_page_writeback_range(struct address_space *mapping,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
cond_resched();
}
+out:
+ ret2 = filemap_check_errors(mapping);
+ if (!ret)
+ ret = ret2;
- /* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
-
- return ret;
-}
-
-/**
- * sync_page_range - write and wait on all pages in the passed range
- * @inode: target inode
- * @mapping: target address_space
- * @pos: beginning offset in pages to write
- * @count: number of bytes to write
- *
- * Write and wait upon all the pages in the passed range. This is a "data
- * integrity" operation. It waits upon in-flight writeout before starting and
- * waiting upon new writeout. If there was an IO error, return it.
- *
- * We need to re-take i_mutex during the generic_osync_inode list walk because
- * it is otherwise livelockable.
- */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
- loff_t pos, loff_t count)
-{
- pgoff_t start = pos >> PAGE_CACHE_SHIFT;
- pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
- int ret;
-
- if (!mapping_cap_writeback_dirty(mapping) || !count)
- return 0;
- ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
- if (ret == 0) {
- mutex_lock(&inode->i_mutex);
- ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- mutex_unlock(&inode->i_mutex);
- }
- if (ret == 0)
- ret = wait_on_page_writeback_range(mapping, start, end);
- return ret;
-}
-EXPORT_SYMBOL(sync_page_range);
-
-/**
- * sync_page_range_nolock
- * @inode: target inode
- * @mapping: target address_space
- * @pos: beginning offset in pages to write
- * @count: number of bytes to write
- *
- * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
- * as it forces O_SYNC writers to different parts of the same file
- * to be serialised right until io completion.
- */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
- loff_t pos, loff_t count)
-{
- pgoff_t start = pos >> PAGE_CACHE_SHIFT;
- pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
- int ret;
-
- if (!mapping_cap_writeback_dirty(mapping) || !count)
- return 0;
- ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
- if (ret == 0)
- ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- if (ret == 0)
- ret = wait_on_page_writeback_range(mapping, start, end);
return ret;
}
-EXPORT_SYMBOL(sync_page_range_nolock);
+EXPORT_SYMBOL(filemap_fdatawait_range);
/**
* filemap_fdatawait - wait for all under-writeback pages to complete
@@ -363,8 +396,7 @@ int filemap_fdatawait(struct address_space *mapping)
if (i_size == 0)
return 0;
- return wait_on_page_writeback_range(mapping, 0,
- (i_size - 1) >> PAGE_CACHE_SHIFT);
+ return filemap_fdatawait_range(mapping, 0, i_size - 1);
}
EXPORT_SYMBOL(filemap_fdatawait);
@@ -385,6 +417,8 @@ int filemap_write_and_wait(struct address_space *mapping)
if (!err)
err = err2;
}
+ } else {
+ err = filemap_check_errors(mapping);
}
return err;
}
@@ -411,89 +445,226 @@ int filemap_write_and_wait_range(struct address_space *mapping,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
if (err != -EIO) {
- int err2 = wait_on_page_writeback_range(mapping,
- lstart >> PAGE_CACHE_SHIFT,
- lend >> PAGE_CACHE_SHIFT);
+ int err2 = filemap_fdatawait_range(mapping,
+ lstart, lend);
if (!err)
err = err2;
}
+ } else {
+ err = filemap_check_errors(mapping);
}
return err;
}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
+
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ *
+ * @new must not already belong to a mapping. It takes over @old's mapping
+ * and index. The mapping's ->freepage hook, if any, is called on @old
+ * after the swap.
+ *
+ * Returns 0 on success, or the error from radix_tree_preload(), in which
+ * case nothing has been changed.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+
+ VM_BUG_ON_PAGE(!PageLocked(old), old);
+ VM_BUG_ON_PAGE(!PageLocked(new), new);
+ VM_BUG_ON_PAGE(new->mapping, new);
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old, NULL);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ /* mem_cgroup codes must not be called under tree_lock */
+ mem_cgroup_replace_page_cache(old, new);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+static int page_cache_tree_insert(struct address_space *mapping,
+ struct page *page, void **shadowp)
+{ /*
+ * Store @page at page->index in @mapping's radix tree. If the slot holds
+ * an exceptional (shadow) entry it is replaced, and handed back through
+ * @shadowp when that is non-NULL. Returns 0, -EEXIST if the slot already
+ * holds a non-exceptional entry, or the error from __radix_tree_create().
+ */
+ struct radix_tree_node *node;
+ void **slot;
+ int error;
+
+ error = __radix_tree_create(&mapping->page_tree, page->index,
+ &node, &slot);
+ if (error)
+ return error;
+ if (*slot) {
+ void *p;
+
+ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ if (!radix_tree_exceptional_entry(p))
+ return -EEXIST;
+ if (shadowp)
+ *shadowp = p;
+ mapping->nrshadows--;
+ if (node)
+ workingset_node_shadows_dec(node);
+ }
+ radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
+ if (node) {
+ workingset_node_pages_inc(node);
+ /*
+ * Don't track node that contains actual pages.
+ *
+ * Avoid acquiring the list_lru lock if already
+ * untracked. The list_empty() test is safe as
+ * node->private_list is protected by
+ * mapping->tree_lock.
+ */
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ }
+ return 0;
+}
+
+static int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask,
+ void **shadowp)
+{ /*
+ * Charge @page to the memory cgroup, preload the radix tree, then insert
+ * the page at @offset under mapping->tree_lock. On success the page holds
+ * an extra reference and NR_FILE_PAGES is incremented; on failure the
+ * charge and the reference are dropped and the error is returned.
+ * @page must be locked and must not be swap-backed. @shadowp is passed
+ * through to page_cache_tree_insert().
+ */
+ int error;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+
+ error = mem_cgroup_charge_file(page, current->mm,
+ gfp_mask & GFP_RECLAIM_MASK);
+ if (error)
+ return error;
+
+ error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ return error;
+ }
+
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = page_cache_tree_insert(mapping, page, shadowp);
+ radix_tree_preload_end();
+ if (unlikely(error))
+ goto err_insert;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ spin_unlock_irq(&mapping->tree_lock);
+ trace_mm_filemap_add_to_page_cache(page);
+ return 0;
+err_insert:
+ page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+ page_cache_release(page);
+ return error;
+}
/**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
* @offset: page index
* @gfp_mask: page allocation mode
*
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
*/
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
- if (error == 0) {
- write_lock_irq(&mapping->tree_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
- if (!error) {
- page_cache_get(page);
- SetPageLocked(page);
- page->mapping = mapping;
- page->index = offset;
- mapping->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- }
- write_unlock_irq(&mapping->tree_lock);
- radix_tree_preload_end();
- }
- return error;
+ return __add_to_page_cache_locked(page, mapping, offset,
+ gfp_mask, NULL);
}
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
- if (ret == 0)
+ void *shadow = NULL;
+ int ret;
+
+ __set_page_locked(page);
+ ret = __add_to_page_cache_locked(page, mapping, offset,
+ gfp_mask, &shadow);
+ if (unlikely(ret))
+ __clear_page_locked(page);
+ else {
+ /*
+ * The page might have been evicted from cache only
+ * recently, in which case it should be activated like
+ * any other repeatedly accessed page.
+ */
+ if (shadow && workingset_refault(shadow)) {
+ SetPageActive(page);
+ workingset_activation(page);
+ } else
+ ClearPageActive(page);
lru_cache_add(page);
+ }
return ret;
}
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
-struct page *page_cache_alloc(struct address_space *x)
+struct page *__page_cache_alloc(gfp_t gfp)
{
- if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, mapping_gfp_mask(x), 0);
- }
- return alloc_pages(mapping_gfp_mask(x), 0);
-}
-EXPORT_SYMBOL(page_cache_alloc);
+ int n;
+ struct page *page;
-struct page *page_cache_alloc_cold(struct address_space *x)
-{
if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
- return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return page;
}
- return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+ return alloc_pages(gfp, 0);
}
-EXPORT_SYMBOL(page_cache_alloc_cold);
+EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -516,16 +687,45 @@ static inline void wake_up_page(struct page *page, int bit)
__wake_up_bit(page_waitqueue(page), &page->flags, bit);
}
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
{
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+
+/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, waiter);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
/**
* unlock_page - unlock a locked page
* @page: the page
@@ -535,17 +735,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
{
- smp_mb__before_clear_bit();
- if (!TestClearPageLocked(page))
- BUG();
- smp_mb__after_clear_bit();
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ clear_bit_unlock(PG_locked, &page->flags);
+ smp_mb__after_atomic();
wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -556,167 +753,442 @@ EXPORT_SYMBOL(unlock_page);
*/
void end_page_writeback(struct page *page)
{
- if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
- if (!test_clear_page_writeback(page))
- BUG();
+ /*
+ * TestClearPageReclaim could be used here but it is an atomic
+ * operation and overkill in this particular case. Failing to
+ * shuffle a page marked for immediate reclaim is too mild to
+ * justify taking an atomic operation penalty at the end of
+ * every page writeback.
+ */
+ if (PageReclaim(page)) {
+ ClearPageReclaim(page);
+ rotate_reclaimable_page(page);
}
- smp_mb__after_clear_bit();
+
+ if (!test_clear_page_writeback(page))
+ BUG();
+
+ smp_mb__after_atomic();
wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
+/*
+ * After completing I/O on a page, call this routine to update the page
+ * flags appropriately
+ */
+void page_endio(struct page *page, int rw, int err)
+{
+ if (rw == READ) {
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } else { /* rw == WRITE */
+ if (err) {
+ SetPageError(page);
+ if (page->mapping)
+ mapping_set_error(page->mapping, err);
+ }
+ end_page_writeback(page);
+ }
+}
+EXPORT_SYMBOL_GPL(page_endio);
+
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
-/*
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void fastcall __lock_page_nosync(struct page *page)
+int __lock_page_killable(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
+
+ return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ sleep_on_page_killable, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(__lock_page_killable);
+
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * CAUTION! In this case, mmap_sem is not released
+ * even though we return 0.
+ */
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ up_read(&mm->mmap_sem);
+ if (flags & FAULT_FLAG_KILLABLE)
+ wait_on_page_locked_killable(page);
+ else
+ wait_on_page_locked(page);
+ return 0;
+ } else {
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
+
+ ret = __lock_page_killable(page);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+ }
+}
+
+/**
+ * page_cache_next_hole - find the next hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
+ * lowest indexed hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'return - index >=
+ * max_scan' will be true). In rare cases of index wrap-around, 0 will
+ * be returned.
+ *
+ * page_cache_next_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 5, then subsequently a hole is created at
+ * index 10, page_cache_next_hole covering both indexes may return 10
+ * if called under rcu_read_lock.
+ */
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index++;
+ if (index == 0)
+ break;
+ }
+
+ return index;
}
+EXPORT_SYMBOL(page_cache_next_hole);
/**
- * find_get_page - find and get a page reference
+ * page_cache_prev_hole - find the prev hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search backwards in the range [max(index-max_scan+1, 0), index] for
+ * the first hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'index - return >=
+ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
+ * will be returned.
+ *
+ * page_cache_prev_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 10, then subsequently a hole is created at
+ * index 5, page_cache_prev_hole covering both indexes may return 5 if
+ * called under rcu_read_lock.
+ */
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+ pgoff_t index, unsigned long max_scan)
+{
+ unsigned long i;
+
+ for (i = 0; i < max_scan; i++) {
+ struct page *page;
+
+ page = radix_tree_lookup(&mapping->page_tree, index);
+ if (!page || radix_tree_exceptional_entry(page))
+ break;
+ index--;
+ if (index == ULONG_MAX)
+ break;
+ }
+
+ return index;
+}
+EXPORT_SYMBOL(page_cache_prev_hole);
+
+/**
+ * find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned with an increased refcount.
*
- * Is there a pagecache struct page at the given (mapping, offset) tuple?
- * If yes, increment its refcount and return it; if no, return NULL.
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
+ *
+ * Otherwise, %NULL is returned.
*/
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
+ void **pagep;
struct page *page;
- read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
- read_unlock_irq(&mapping->tree_lock);
+ rcu_read_lock();
+repeat:
+ page = NULL;
+ pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
+ if (pagep) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ /*
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Return
+ * it without attempting to raise page count.
+ */
+ goto out;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+out:
+ rcu_read_unlock();
+
return page;
}
-EXPORT_SYMBOL(find_get_page);
+EXPORT_SYMBOL(find_get_entry);
/**
- * find_trylock_page - find and lock a page
+ * find_lock_entry - locate, pin and lock a page cache entry
* @mapping: the address_space to search
- * @offset: the page index
+ * @offset: the page cache index
+ *
+ * Looks up the page cache slot at @mapping & @offset. If there is a
+ * page cache page, it is returned locked and with an increased
+ * refcount.
+ *
+ * If the slot holds a shadow entry of a previously evicted page, or a
+ * swap entry from shmem/tmpfs, it is returned.
*
- * Same as find_get_page(), but trylock it instead of incrementing the count.
+ * Otherwise, %NULL is returned.
+ *
+ * find_lock_entry() may sleep.
*/
-struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
struct page *page;
- read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page && TestSetPageLocked(page))
- page = NULL;
- read_unlock_irq(&mapping->tree_lock);
+repeat:
+ page = find_get_entry(mapping, offset);
+ if (page && !radix_tree_exception(page)) {
+ lock_page(page);
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON_PAGE(page->index != offset, page);
+ }
return page;
}
-EXPORT_SYMBOL(find_trylock_page);
+EXPORT_SYMBOL(find_lock_entry);
/**
- * find_lock_page - locate, pin and lock a pagecache page
+ * pagecache_get_page - find and get a page reference
* @mapping: the address_space to search
* @offset: the page index
+ * @fgp_flags: FGP flags
+ * @cache_gfp_mask: gfp mask to use for the page cache data page allocation
+ * @radix_gfp_mask: gfp mask to use for radix tree node allocation
+ *
+ * Looks up the page cache slot at @mapping & @offset.
+ *
+ * FGP flags modify how the page is returned.
*
- * Locates the desired pagecache page, locks it, increments its reference
- * count and returns its address.
+ * FGP_ACCESSED: the page will be marked accessed
+ * FGP_LOCK: Page is returned locked
+ * FGP_CREAT: If page is not present then a new page is allocated using
+ * @cache_gfp_mask and added to the page cache and the VM's LRU
+ * list. If radix tree nodes are allocated during page cache
+ * insertion then @radix_gfp_mask is used. The page is returned
+ * locked and with an increased refcount. Otherwise, %NULL is
+ * returned.
*
- * Returns zero if the page was not present. find_lock_page() may sleep.
+ * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
+ * if the GFP flags specified for FGP_CREAT are atomic.
+ *
+ * If there is a page cache page, it is returned with an increased refcount.
*/
-struct page *find_lock_page(struct address_space *mapping,
- unsigned long offset)
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
+ int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
{
struct page *page;
- read_lock_irq(&mapping->tree_lock);
repeat:
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page) {
- page_cache_get(page);
- if (TestSetPageLocked(page)) {
- read_unlock_irq(&mapping->tree_lock);
- __lock_page(page);
- read_lock_irq(&mapping->tree_lock);
+ page = find_get_entry(mapping, offset);
+ if (radix_tree_exceptional_entry(page))
+ page = NULL;
+ if (!page)
+ goto no_page;
- /* Has the page been truncated while we slept? */
- if (unlikely(page->mapping != mapping ||
- page->index != offset)) {
- unlock_page(page);
+ if (fgp_flags & FGP_LOCK) {
+ if (fgp_flags & FGP_NOWAIT) {
+ if (!trylock_page(page)) {
page_cache_release(page);
- goto repeat;
+ return NULL;
}
+ } else {
+ lock_page(page);
+ }
+
+ /* Has the page been truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON_PAGE(page->index != offset, page);
+ }
+
+ if (page && (fgp_flags & FGP_ACCESSED))
+ mark_page_accessed(page);
+
+no_page:
+ if (!page && (fgp_flags & FGP_CREAT)) {
+ int err;
+ if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ cache_gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS) {
+ cache_gfp_mask &= ~__GFP_FS;
+ radix_gfp_mask &= ~__GFP_FS;
+ }
+
+ page = __page_cache_alloc(cache_gfp_mask);
+ if (!page)
+ return NULL;
+
+ if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ fgp_flags |= FGP_LOCK;
+
+ /* Init accessed so avoid atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ init_page_accessed(page);
+
+ err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ page = NULL;
+ if (err == -EEXIST)
+ goto repeat;
}
}
- read_unlock_irq(&mapping->tree_lock);
+
return page;
}
-EXPORT_SYMBOL(find_lock_page);
+EXPORT_SYMBOL(pagecache_get_page);
/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
+ * find_get_entries - gang pagecache lookup
+ * @mapping: The address_space to search
+ * @start: The starting page cache index
+ * @nr_entries: The maximum number of entries
+ * @entries: Where the resulting entries are placed
+ * @indices: The cache indices corresponding to the entries in @entries
*
- * Locates a page in the pagecache. If the page is not present, a new page
- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
- * LRU list. The returned page is locked and has its reference count
- * incremented.
+ * find_get_entries() will search for and return a group of up to
+ * @nr_entries entries in the mapping. The entries are placed at
+ * @entries. find_get_entries() takes a reference against any actual
+ * pages it returns.
*
- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
- * allocation!
+ * The search returns a group of mapping-contiguous page cache entries
+ * with ascending indexes. There may be holes in the indices due to
+ * not-present pages.
*
- * find_or_create_page() returns the desired page's address, or zero on
- * memory exhaustion.
+ * Any shadow entries of evicted pages, or swap entries from
+ * shmem/tmpfs, are included in the returned array.
+ *
+ * find_get_entries() returns the number of pages and shadow entries
+ * which were found.
*/
-struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, gfp_t gfp_mask)
+unsigned find_get_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
{
- struct page *page, *cached_page = NULL;
- int err;
+ void **slot;
+ unsigned int ret = 0;
+ struct radix_tree_iter iter;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
- if (!cached_page) {
- cached_page = alloc_page(gfp_mask);
- if (!cached_page)
- return NULL;
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto restart;
+ /*
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Return
+ * it without attempting to raise page count.
+ */
+ goto export;
}
- err = add_to_page_cache_lru(cached_page, mapping,
- index, gfp_mask);
- if (!err) {
- page = cached_page;
- cached_page = NULL;
- } else if (err == -EEXIST)
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
goto repeat;
+ }
+export:
+ indices[ret] = iter.index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
}
- if (cached_page)
- page_cache_release(cached_page);
- return page;
+ rcu_read_unlock();
+ return ret;
}
-EXPORT_SYMBOL(find_or_create_page);
/**
* find_get_pages - gang pagecache lookup
@@ -737,15 +1209,55 @@ EXPORT_SYMBOL(find_or_create_page);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(iter.index);
+ goto restart;
+ }
+ /*
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Skip
+ * over it.
+ */
+ continue;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
+ }
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup(&mapping->page_tree,
- (void **)pages, start, nr_pages);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
- read_unlock_irq(&mapping->tree_lock);
+ rcu_read_unlock();
return ret;
}
@@ -764,22 +1276,67 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int ret = 0;
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup(&mapping->page_tree,
- (void **)pages, index, nr_pages);
- for (i = 0; i < ret; i++) {
- if (pages[i]->mapping == NULL || pages[i]->index != index)
+ if (unlikely(!nr_pages))
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ /* A hole: there is no reason to continue */
+ if (unlikely(!page))
break;
- page_cache_get(pages[i]);
- index++;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * A shadow entry of a recently evicted page,
+ * or a swap entry from shmem/tmpfs. Stop
+ * looking for contiguous pages.
+ */
+ break;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != iter.index) {
+ page_cache_release(page);
+ break;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
}
- read_unlock_irq(&mapping->tree_lock);
- return i;
+ rcu_read_unlock();
+ return ret;
}
+EXPORT_SYMBOL(find_get_pages_contig);
/**
* find_get_pages_tag - find and return pages that match @tag
@@ -795,54 +1352,68 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
- read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
- (void **)pages, *index, nr_pages, tag);
- for (i = 0; i < ret; i++)
- page_cache_get(pages[i]);
- if (ret)
- *index = pages[ret - 1]->index + 1;
- read_unlock_irq(&mapping->tree_lock);
- return ret;
-}
+ if (unlikely(!nr_pages))
+ return 0;
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page, but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed. This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
-{
- struct page *page = find_get_page(mapping, index);
- gfp_t gfp_mask;
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ &iter, *index, tag) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
- if (page) {
- if (!TestSetPageLocked(page))
- return page;
- page_cache_release(page);
- return NULL;
- }
- gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
- page = alloc_pages(gfp_mask, 0);
- if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
- page_cache_release(page);
- page = NULL;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * A shadow entry of a recently evicted page.
+ *
+ * Those entries should never be tagged, but
+ * this tree walk is lockless and the tags are
+ * looked up in bulk, one radix tree node at a
+ * time, so there is a sizable window for page
+ * reclaim to evict a page we saw tagged.
+ *
+ * Skip over it.
+ */
+ continue;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ pages[ret] = page;
+ if (++ret == nr_pages)
+ break;
}
- return page;
+
+ rcu_read_unlock();
+
+ if (ret)
+ *index = pages[ret - 1]->index + 1;
+
+ return ret;
}
-EXPORT_SYMBOL(grab_cache_page_nowait);
+EXPORT_SYMBOL(find_get_pages_tag);
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
@@ -862,92 +1433,105 @@ EXPORT_SYMBOL(grab_cache_page_nowait);
static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
{
- if (!ra->ra_pages)
- return;
-
ra->ra_pages /= 4;
}
/**
- * do_generic_mapping_read - generic file read routine
- * @mapping: address_space to be read
- * @_ra: file's readahead state
+ * do_generic_file_read - generic file read routine
* @filp: the file to read
* @ppos: current file position
- * @desc: read_descriptor
- * @actor: read method
+ * @iter: data destination
+ * @written: already copied
*
* This is a generic file read routine, and uses the
* mapping->a_ops->readpage() function for the actual low-level stuff.
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
- *
- * Note the struct file* is only passed for the use of readpage.
- * It may be NULL.
*/
-void do_generic_mapping_read(struct address_space *mapping,
- struct file_ra_state *_ra,
- struct file *filp,
- loff_t *ppos,
- read_descriptor_t *desc,
- read_actor_t actor)
+static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+ struct iov_iter *iter, ssize_t written)
{
+ struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- unsigned long index;
- unsigned long end_index;
- unsigned long offset;
- unsigned long last_index;
- unsigned long next_index;
- unsigned long prev_index;
- loff_t isize;
- struct page *cached_page;
- int error;
- struct file_ra_state ra = *_ra;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index;
+ pgoff_t last_index;
+ pgoff_t prev_index;
+ unsigned long offset; /* offset into pagecache page */
+ unsigned int prev_offset;
+ int error = 0;
- cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
- next_index = index;
- prev_index = ra.prev_page;
- last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
+ last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
for (;;) {
struct page *page;
+ pgoff_t end_index;
+ loff_t isize;
unsigned long nr, ret;
+ cond_resched();
+find_page:
+ page = find_get_page(mapping, index);
+ if (!page) {
+ page_cache_sync_readahead(mapping,
+ ra, filp,
+ index, last_index - index);
+ page = find_get_page(mapping, index);
+ if (unlikely(page == NULL))
+ goto no_cached_page;
+ }
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(mapping,
+ ra, filp, page,
+ index, last_index - index);
+ }
+ if (!PageUptodate(page)) {
+ if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ offset, iter->count))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ }
+page_ok:
+ /*
+ * i_size must be checked after we know the page is Uptodate.
+ *
+ * Checking i_size after the check allows us to calculate
+ * the correct value for "nr", which means the zero-filled
+ * part of the page is not copied back to userspace (unless
+ * another truncate extends the file - this is desired though).
+ */
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ goto out;
+ }
+
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
+ if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
+ page_cache_release(page);
goto out;
}
}
nr = nr - offset;
- cond_resched();
- if (index == next_index)
- next_index = page_cache_readahead(mapping, &ra, filp,
- index, last_index - index);
-
-find_page:
- page = find_get_page(mapping, index);
- if (unlikely(page == NULL)) {
- handle_ra_miss(mapping, &ra, index);
- goto no_cached_page;
- }
- if (!PageUptodate(page))
- goto page_not_up_to_date;
-page_ok:
-
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
@@ -956,37 +1540,41 @@ page_ok:
flush_dcache_page(page);
/*
- * When (part of) the same page is read multiple times
- * in succession, only mark it as accessed the first time.
+ * When a sequential read accesses a page several times,
+ * only mark it as accessed the first time.
*/
- if (prev_index != index)
+ if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
+
+ ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
+ prev_offset = offset;
page_cache_release(page);
- if (ret == nr && desc->count)
- continue;
- goto out;
+ written += ret;
+ if (!iov_iter_count(iter))
+ goto out;
+ if (ret < nr) {
+ error = -EFAULT;
+ goto out;
+ }
+ continue;
page_not_up_to_date:
/* Get exclusive access to the page ... */
- lock_page(page);
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
+page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
@@ -1001,66 +1589,49 @@ page_not_up_to_date:
}
readpage:
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, e.g. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
+ error = 0;
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
- lock_page(page);
+ error = lock_page_killable(page);
+ if (unlikely(error))
+ goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
- * invalidate_inode_pages got it
+ * invalidate_mapping_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
+ shrink_readahead_size_eio(filp, ra);
error = -EIO;
- shrink_readahead_size_eio(filp, &ra);
goto readpage_error;
}
unlock_page(page);
}
- /*
- * i_size must be checked after we have done ->readpage.
- *
- * Checking i_size after the readpage allows us to calculate
- * the correct value for "nr", which means the zero-filled
- * part of the page is not copied back to userspace (unless
- * another truncate extends the file - this is desired though).
- */
- isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
- goto out;
- }
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- page_cache_release(page);
- goto out;
- }
- }
- nr = nr - offset;
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
- desc->error = error;
page_cache_release(page);
goto out;
@@ -1069,261 +1640,93 @@ no_cached_page:
* Ok, it wasn't cached, so we need to create a new
* page..
*/
- if (!cached_page) {
- cached_page = page_cache_alloc_cold(mapping);
- if (!cached_page) {
- desc->error = -ENOMEM;
- goto out;
- }
+ page = page_cache_alloc_cold(mapping);
+ if (!page) {
+ error = -ENOMEM;
+ goto out;
}
- error = add_to_page_cache_lru(cached_page, mapping,
+ error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
- if (error == -EEXIST)
+ page_cache_release(page);
+ if (error == -EEXIST) {
+ error = 0;
goto find_page;
- desc->error = error;
+ }
goto out;
}
- page = cached_page;
- cached_page = NULL;
goto readpage;
}
out:
- *_ra = ra;
-
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- if (cached_page)
- page_cache_release(cached_page);
- if (filp)
- file_accessed(filp);
-}
-EXPORT_SYMBOL(do_generic_mapping_read);
-
-int file_read_actor(read_descriptor_t *desc, struct page *page,
- unsigned long offset, unsigned long size)
-{
- char *kaddr;
- unsigned long left, count = desc->count;
-
- if (size > count)
- size = count;
-
- /*
- * Faults on the destination of a read are common, so do it before
- * taking the kmap.
- */
- if (!fault_in_pages_writeable(desc->arg.buf, size)) {
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_to_user_inatomic(desc->arg.buf,
- kaddr + offset, size);
- kunmap_atomic(kaddr, KM_USER0);
- if (left == 0)
- goto success;
- }
-
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
- kunmap(page);
+ ra->prev_pos = prev_index;
+ ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos |= prev_offset;
- if (left) {
- size -= left;
- desc->error = -EFAULT;
- }
-success:
- desc->count = count - size;
- desc->written += size;
- desc->arg.buf += size;
- return size;
+ *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+ file_accessed(filp);
+ return written ? written : error;
}
/**
- * __generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
- * @iov: io vector request
- * @nr_segs: number of segments in the iovec
- * @ppos: current file position
+ * @iter: destination for the data read
*
- * This is the "read()" routine for all filesystems
+ * This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
*/
ssize_t
-__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct file *filp = iocb->ki_filp;
- ssize_t retval;
- unsigned long seg;
- size_t count;
-
- count = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *iv = &iov[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- count += iv->iov_len;
- if (unlikely((ssize_t)(count|iv->iov_len) < 0))
- return -EINVAL;
- if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
- continue;
- if (seg == 0)
- return -EFAULT;
- nr_segs = seg;
- count -= iv->iov_len; /* This segment is no good */
- break;
- }
+ struct file *file = iocb->ki_filp;
+ ssize_t retval = 0;
+ loff_t *ppos = &iocb->ki_pos;
+ loff_t pos = *ppos;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (filp->f_flags & O_DIRECT) {
- loff_t pos = *ppos, size;
- struct address_space *mapping;
- struct inode *inode;
-
- mapping = filp->f_mapping;
- inode = mapping->host;
- retval = 0;
+ if (file->f_flags & O_DIRECT) {
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
+ loff_t size;
+
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
- if (pos < size) {
- retval = generic_file_direct_IO(READ, iocb,
- iov, pos, nr_segs);
- if (retval > 0 && !is_sync_kiocb(iocb))
- retval = -EIOCBQUEUED;
- if (retval > 0)
- *ppos = pos + retval;
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + count - 1);
+ if (!retval) {
+ struct iov_iter data = *iter;
+ retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
}
- file_accessed(filp);
- goto out;
- }
- retval = 0;
- if (count) {
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
+ if (retval > 0) {
+ *ppos = pos + retval;
+ iov_iter_advance(iter, retval);
+ }
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_generic_file_read(filp,ppos,&desc,file_read_actor);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fall through to buffered I/O for
+ * the rest of the read.
+ */
+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+ file_accessed(file);
+ goto out;
}
}
+
+ retval = do_generic_file_read(file, ppos, iter, retval);
out:
return retval;
}
-EXPORT_SYMBOL(__generic_file_aio_read);
-
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
-{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-
- BUG_ON(iocb->ki_pos != pos);
- return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
-}
-EXPORT_SYMBOL(generic_file_aio_read);
-
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_read);
-
-int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
-{
- ssize_t written;
- unsigned long count = desc->count;
- struct file *file = desc->arg.data;
-
- if (size > count)
- size = count;
-
- written = file->f_op->sendpage(file, page, offset,
- size, &file->f_pos, size<count);
- if (written < 0) {
- desc->error = written;
- written = 0;
- }
- desc->count = count - written;
- desc->written += written;
- return written;
-}
-
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_generic_file_read(in_file, ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
- unsigned long index, unsigned long nr)
-{
- if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
- return -EINVAL;
-
- force_page_cache_readahead(mapping, filp, index,
- max_sane_readahead(nr));
- return 0;
-}
-
-asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
-{
- ssize_t ret;
- struct file *file;
-
- ret = -EBADF;
- file = fget(fd);
- if (file) {
- if (file->f_mode & FMODE_READ) {
- struct address_space *mapping = file->f_mapping;
- unsigned long start = offset >> PAGE_CACHE_SHIFT;
- unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
- unsigned long len = end - start + 1;
- ret = do_readahead(mapping, file, start, len);
- }
- fput(file);
- }
- return ret;
-}
+EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
/**
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
@@ -1332,7 +1735,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int fastcall page_cache_read(struct file * file, unsigned long offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
@@ -1358,128 +1761,163 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
#define MMAP_LOTSAMISS (100)
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ pgoff_t offset)
+{
+ unsigned long ra_pages;
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (vma->vm_flags & VM_RAND_READ)
+ return;
+ if (!ra->ra_pages)
+ return;
+
+ if (vma->vm_flags & VM_SEQ_READ) {
+ page_cache_sync_readahead(mapping, ra, file, offset,
+ ra->ra_pages);
+ return;
+ }
+
+ /* Avoid banging the cache line if not needed */
+ if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
+ ra->mmap_miss++;
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (ra->mmap_miss > MMAP_LOTSAMISS)
+ return;
+
+ /*
+ * mmap read-around
+ */
+ ra_pages = max_sane_readahead(ra->ra_pages);
+ ra->start = max_t(long, 0, offset - ra_pages / 2);
+ ra->size = ra_pages;
+ ra->async_size = ra_pages / 4;
+ ra_submit(ra, mapping, file);
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and it has
+ * PG_readahead set, so we may want to extend the readahead further.
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+ struct file_ra_state *ra,
+ struct file *file,
+ struct page *page,
+ pgoff_t offset)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ /* If we don't want any read-ahead, don't bother */
+ if (vma->vm_flags & VM_RAND_READ)
+ return;
+ if (ra->mmap_miss > 0)
+ ra->mmap_miss--;
+ if (PageReadahead(page))
+ page_cache_async_readahead(mapping, ra, file,
+ page, offset, ra->ra_pages);
+}
+
/**
- * filemap_nopage - read in file data for page fault handling
- * @area: the applicable vm_area
- * @address: target address to read in
- * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ * filemap_fault - read in file data for page fault handling
+ * @vma: vma in which the fault was taken
+ * @vmf: struct vm_fault containing details of the fault
*
- * filemap_nopage() is invoked via the vma operations vector for a
+ * filemap_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
-struct page *filemap_nopage(struct vm_area_struct *area,
- unsigned long address, int *type)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int error;
- struct file *file = area->vm_file;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
+ pgoff_t offset = vmf->pgoff;
struct page *page;
- unsigned long size, pgoff;
- int did_readaround = 0, majmin = VM_FAULT_MINOR;
-
- pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
-
-retry_all:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff >= size)
- goto outside_data_content;
-
- /* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(area))
- goto no_cached_page;
+ loff_t size;
+ int ret = 0;
- /*
- * The readahead code wants to be told about each and every page
- * so it can build and shrink its windows appropriately
- *
- * For sequential accesses, we use the generic readahead logic.
- */
- if (VM_SequentialReadHint(area))
- page_cache_readahead(mapping, ra, file, pgoff, 1);
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (offset >= size >> PAGE_CACHE_SHIFT)
+ return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
-retry_find:
- page = find_get_page(mapping, pgoff);
- if (!page) {
- unsigned long ra_pages;
-
- if (VM_SequentialReadHint(area)) {
- handle_ra_miss(mapping, ra, pgoff);
- goto no_cached_page;
- }
- ra->mmap_miss++;
-
- /*
- * Do we miss much more than hit in this file? If so,
- * stop bothering with read-ahead. It will only hurt.
- */
- if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
- goto no_cached_page;
-
+ page = find_get_page(mapping, offset);
+ if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
- * To keep the pgmajfault counter straight, we need to
- * check did_readaround, as this is an inner loop.
+ * We found the page, so try async readahead before
+ * waiting for the lock.
*/
- if (!did_readaround) {
- majmin = VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- }
- did_readaround = 1;
- ra_pages = max_sane_readahead(file->f_ra.ra_pages);
- if (ra_pages) {
- pgoff_t start = 0;
-
- if (pgoff > ra_pages / 2)
- start = pgoff - ra_pages / 2;
- do_page_cache_readahead(mapping, file, start, ra_pages);
- }
- page = find_get_page(mapping, pgoff);
+ do_async_mmap_readahead(vma, ra, file, page, offset);
+ } else if (!page) {
+ /* No page in the page cache at all */
+ do_sync_mmap_readahead(vma, ra, file, offset);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ ret = VM_FAULT_MAJOR;
+retry_find:
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
- if (!did_readaround)
- ra->mmap_hit++;
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON_PAGE(page->index != offset, page);
/*
- * Ok, found a page in the page cache, now we need to check
- * that it's up-to-date.
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. If not, it is going to be due to an error.
*/
- if (!PageUptodate(page))
+ if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
-success:
/*
* Found the page and have a reference on it.
+ * We must recheck i_size under page lock.
*/
- mark_page_accessed(page);
- if (type)
- *type = majmin;
- return page;
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
+ unlock_page(page);
+ page_cache_release(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf->page = page;
+ return ret | VM_FAULT_LOCKED;
-outside_data_content:
- /*
- * An external ptracer can access pages that normally aren't
- * accessible..
- */
- if (area->vm_mm == current->mm)
- return NULL;
- /* Fall through to the non-read-ahead case */
no_cached_page:
/*
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
- error = page_cache_read(file, pgoff);
- grab_swap_token();
+ error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
@@ -1495,256 +1933,138 @@ no_cached_page:
* to schedule I/O.
*/
if (error == -ENOMEM)
- return NOPAGE_OOM;
- return NULL;
+ return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
page_not_uptodate:
- if (!did_readaround) {
- majmin = VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- }
- lock_page(page);
-
- /* Did it get unhashed while we waited for it? */
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto retry_all;
- }
-
- /* Did somebody else get it up-to-date? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
-
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
- }
-
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
- lock_page(page);
-
- /* Somebody truncated the page on us? */
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto retry_all;
- }
-
- /* Somebody else successfully read it in? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
ClearPageError(page);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
+ if (!PageUptodate(page))
+ error = -EIO;
}
+ page_cache_release(page);
- /*
- * Things didn't work out. Return zero to tell the
- * mm layer so, possibly freeing the page cache page first.
- */
+ if (!error || error == AOP_TRUNCATED_PAGE)
+ goto retry_find;
+
+ /* Things didn't work out. Return VM_FAULT_SIGBUS to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
- page_cache_release(page);
- return NULL;
+ return VM_FAULT_SIGBUS;
}
-EXPORT_SYMBOL(filemap_nopage);
+EXPORT_SYMBOL(filemap_fault);
-static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
- int nonblock)
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ loff_t size;
struct page *page;
- int error;
+ unsigned long address = (unsigned long) vmf->virtual_address;
+ unsigned long addr;
+ pte_t *pte;
- /*
- * Do we have something in the page cache already?
- */
-retry_find:
- page = find_get_page(mapping, pgoff);
- if (!page) {
- if (nonblock)
- return NULL;
- goto no_cached_page;
- }
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
+ if (iter.index > vmf->max_pgoff)
+ break;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ goto next;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ break;
+ else
+ goto next;
+ }
- /*
- * Ok, found a page in the page cache, now we need to check
- * that it's up-to-date.
- */
- if (!PageUptodate(page)) {
- if (nonblock) {
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
page_cache_release(page);
- return NULL;
+ goto repeat;
}
- goto page_not_uptodate;
- }
-success:
- /*
- * Found the page and have a reference on it.
- */
- mark_page_accessed(page);
- return page;
+ if (!PageUptodate(page) ||
+ PageReadahead(page) ||
+ PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(page))
+ goto skip;
-no_cached_page:
- error = page_cache_read(file, pgoff);
+ if (page->mapping != mapping || !PageUptodate(page))
+ goto unlock;
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
+ size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+ if (page->index >= size >> PAGE_CACHE_SHIFT)
+ goto unlock;
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- return NULL;
-
-page_not_uptodate:
- lock_page(page);
+ pte = vmf->pte + page->index - vmf->pgoff;
+ if (!pte_none(*pte))
+ goto unlock;
- /* Did it get truncated while we waited for it? */
- if (!page->mapping) {
+ if (file->f_ra.mmap_miss > 0)
+ file->f_ra.mmap_miss--;
+ addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
+ do_set_pte(vma, addr, page, pte, false, false);
unlock_page(page);
- goto err;
- }
-
- /* Did somebody else get it up-to-date? */
- if (PageUptodate(page)) {
+ goto next;
+unlock:
unlock_page(page);
- goto success;
- }
-
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
+skip:
page_cache_release(page);
- goto retry_find;
+next:
+ if (iter.index == vmf->max_pgoff)
+ break;
}
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(filemap_map_pages);
- /*
- * Umm, take care of errors if the page isn't up-to-date.
- * Try to re-read it _once_. We do this synchronously,
- * because there really aren't any performance issues here
- * and we need to check for errors.
- */
- lock_page(page);
+int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret = VM_FAULT_LOCKED;
- /* Somebody truncated the page on us? */
- if (!page->mapping) {
- unlock_page(page);
- goto err;
- }
- /* Somebody else successfully read it in? */
- if (PageUptodate(page)) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
unlock_page(page);
- goto success;
- }
-
- ClearPageError(page);
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
+ ret = VM_FAULT_NOPAGE;
+ goto out;
}
-
/*
- * Things didn't work out. Return zero to tell the
- * mm layer so, possibly freeing the page cache page first.
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
*/
-err:
- page_cache_release(page);
-
- return NULL;
-}
-
-int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
- unsigned long len, pgprot_t prot, unsigned long pgoff,
- int nonblock)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- unsigned long size;
- struct mm_struct *mm = vma->vm_mm;
- struct page *page;
- int err;
-
- if (!nonblock)
- force_page_cache_readahead(mapping, vma->vm_file,
- pgoff, len >> PAGE_CACHE_SHIFT);
-
-repeat:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
- return -EINVAL;
-
- page = filemap_getpage(file, pgoff, nonblock);
-
- /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
- * done in shmem_populate calling shmem_getpage */
- if (!page && !nonblock)
- return -ENOMEM;
-
- if (page) {
- err = install_page(mm, vma, addr, page, prot);
- if (err) {
- page_cache_release(page);
- return err;
- }
- } else if (vma->vm_flags & VM_NONLINEAR) {
- /* No page was found just because we can't read it in now (being
- * here implies nonblock != 0), but the page may exist, so set
- * the PTE to fault it in later. */
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
- }
-
- len -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- if (len)
- goto repeat;
-
- return 0;
+ set_page_dirty(page);
+ wait_for_stable_page(page);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
-EXPORT_SYMBOL(filemap_populate);
+EXPORT_SYMBOL(filemap_page_mkwrite);
-struct vm_operations_struct generic_file_vm_ops = {
- .nopage = filemap_nopage,
- .populate = filemap_populate,
+const struct vm_operations_struct generic_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
@@ -1783,66 +2103,65 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
-static inline struct page *__read_cache_page(struct address_space *mapping,
- unsigned long index,
- int (*filler)(void *,struct page*),
- void *data)
+static struct page *wait_on_page_read(struct page *page)
{
- struct page *page, *cached_page = NULL;
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+static struct page *__read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data,
+ gfp_t gfp)
+{
+ struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
- if (!cached_page) {
- cached_page = page_cache_alloc_cold(mapping);
- if (!cached_page)
- return ERR_PTR(-ENOMEM);
- }
- err = add_to_page_cache_lru(cached_page, mapping,
- index, GFP_KERNEL);
- if (err == -EEXIST)
- goto repeat;
- if (err < 0) {
+ page = __page_cache_alloc(gfp | __GFP_COLD);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ err = add_to_page_cache_lru(page, mapping, index, gfp);
+ if (unlikely(err)) {
+ page_cache_release(page);
+ if (err == -EEXIST)
+ goto repeat;
/* Presumably ENOMEM for radix tree node */
- page_cache_release(cached_page);
return ERR_PTR(err);
}
- page = cached_page;
- cached_page = NULL;
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
}
}
- if (cached_page)
- page_cache_release(cached_page);
return page;
}
-/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: destination for read data
- *
- * Read into the page cache. If a page already exists,
- * and PageUptodate() is not set, try to fill the page.
- */
-struct page *read_cache_page(struct address_space *mapping,
- unsigned long index,
- int (*filler)(void *,struct page*),
- void *data)
+static struct page *do_read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data,
+ gfp_t gfp)
+
{
struct page *page;
int err;
retry:
- page = __read_cache_page(mapping, index, filler, data);
+ page = __read_cache_page(mapping, index, filler, data, gfp);
if (IS_ERR(page))
- goto out;
- mark_page_accessed(page);
+ return page;
if (PageUptodate(page))
goto out;
@@ -1859,102 +2178,58 @@ retry:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
+ return ERR_PTR(err);
+ } else {
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
}
- out:
+out:
+ mark_page_accessed(page);
return page;
}
-EXPORT_SYMBOL(read_cache_page);
-/*
- * If the page was newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec. This function is specifically for
- * generic_file_write().
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: first arg to filler(data, page) function, often left as NULL
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page and wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
*/
-static inline struct page *
-__grab_cache_page(struct address_space *mapping, unsigned long index,
- struct page **cached_page, struct pagevec *lru_pvec)
+struct page *read_cache_page(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *, struct page *),
+ void *data)
{
- int err;
- struct page *page;
-repeat:
- page = find_lock_page(mapping, index);
- if (!page) {
- if (!*cached_page) {
- *cached_page = page_cache_alloc(mapping);
- if (!*cached_page)
- return NULL;
- }
- err = add_to_page_cache(*cached_page, mapping,
- index, GFP_KERNEL);
- if (err == -EEXIST)
- goto repeat;
- if (err == 0) {
- page = *cached_page;
- page_cache_get(page);
- if (!pagevec_add(lru_pvec, page))
- __pagevec_lru_add(lru_pvec);
- *cached_page = NULL;
- }
- }
- return page;
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
}
+EXPORT_SYMBOL(read_cache_page);
-/*
- * The logic we want is
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags.
*
- * if suid or (sgid and xgrp)
- * remove privs
+ * If the page does not get brought uptodate, return -EIO.
*/
-int remove_suid(struct dentry *dentry)
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
{
- mode_t mode = dentry->d_inode->i_mode;
- int kill = 0;
- int result = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID))) {
- struct iattr newattrs;
-
- newattrs.ia_valid = ATTR_FORCE | kill;
- result = notify_change(dentry, &newattrs);
- }
- return result;
-}
-EXPORT_SYMBOL(remove_suid);
-
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
+ return do_read_cache_page(mapping, index, filler, NULL, gfp);
}
+EXPORT_SYMBOL(read_cache_page_gfp);
/*
* Performs necessary checks before doing a write
@@ -1966,7 +2241,7 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
{
struct inode *inode = file->f_mapping->host;
- unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ unsigned long limit = rlimit(RLIMIT_FSIZE);
if (unlikely(*pos < 0))
return -EINVAL;
@@ -1993,7 +2268,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos + *count > MAX_NON_LFS &&
!(file->f_flags & O_LARGEFILE))) {
if (*pos >= MAX_NON_LFS) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -2011,7 +2285,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (likely(!isblk)) {
if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
if (*count || *pos > inode->i_sb->s_maxbytes) {
- send_sig(SIGXFSZ, current, 0);
return -EFBIG;
}
/* zero-length writes at ->s_maxbytes are OK */
@@ -2020,6 +2293,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
*count = inode->i_sb->s_maxbytes - *pos;
} else {
+#ifdef CONFIG_BLOCK
loff_t isize;
if (bdev_read_only(I_BDEV(inode)))
return -EPERM;
@@ -2031,253 +2305,245 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
if (*pos + *count > isize)
*count = isize - *pos;
+#else
+ return -EPERM;
+#endif
}
return 0;
}
EXPORT_SYMBOL(generic_write_checks);
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ return aops->write_begin(file, mapping, pos, len, flags,
+ pagep, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ const struct address_space_operations *aops = mapping->a_ops;
+
+ return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long *nr_segs, loff_t pos, loff_t *ppos,
- size_t count, size_t ocount)
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t written;
+ size_t write_len;
+ pgoff_t end;
+ struct iov_iter data;
- if (count != ocount)
- *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+ write_len = iov_iter_count(from);
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
- written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
- if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
- mark_inode_dirty(inode);
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
+ if (written)
+ goto out;
+
+ /*
+ * After a write we want buffered reads to be sure to go to disk to get
+ * the new data. We invalidate clean cached page from the region we're
+ * about to write. We do this *before* the write so that we can return
+ * without clobbering -EIOCBQUEUED from ->direct_IO().
+ */
+ if (mapping->nrpages) {
+ written = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
}
- *ppos = end;
}
+ data = *from;
+ written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+
/*
- * Sync the fs metadata but not the minor inode changes and
- * of course not the data as we did direct DMA for the IO.
- * i_mutex is held, which protects generic_osync_inode() from
- * livelocking.
+ * Finally, try again to invalidate clean pages which might have been
+ * cached by non-direct readahead, or faulted in by get_user_pages()
+ * if the source of the write was an mmap'ed region of the file
+ * we're writing. Either one is a pretty crazy thing to do,
+ * so we don't support it 100%. If this invalidation
+ * fails, tough, the write still worked...
*/
- if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- if (err < 0)
- written = err;
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
}
- if (written == count && !is_sync_kiocb(iocb))
- written = -EIOCBQUEUED;
+
+ if (written > 0) {
+ pos += written;
+ iov_iter_advance(from, written);
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ iocb->ki_pos = pos;
+ }
+out:
return written;
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/*
+ * Find or create a page at the given pagecache position. Return the locked
+ * page. This function is specifically for buffered writes.
+ */
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
+ struct page *page;
+ int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
- pagevec_init(&lru_pvec, 0);
+ if (flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+
+ page = pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping),
+ GFP_KERNEL);
+ if (page)
+ wait_for_stable_page(page);
+
+ return page;
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+ssize_t generic_perform_write(struct file *file,
+ struct iov_iter *i, loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ long status = 0;
+ ssize_t written = 0;
+ unsigned int flags = 0;
/*
- * handle partial DIO write. Adjust cur_iov if needed.
+ * Copies from kernel address space cannot fail (NFSD is a big user).
*/
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
+ if (segment_eq(get_fs(), KERNEL_DS))
+ flags |= AOP_FLAG_UNINTERRUPTIBLE;
do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
+ struct page *page;
+ unsigned long offset; /* Offset into pagecache page */
+ unsigned long bytes; /* Bytes to write to page */
+ size_t copied; /* Bytes copied from user */
+ void *fsdata;
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ offset = (pos & (PAGE_CACHE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(i));
+again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
+ *
+ * Not only is this an optimisation, but it is also required
+ * to check that the address is actually valid, when atomic
+ * usercopies are used, below.
*/
- fault_in_pages_readable(buf, bytes);
-
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
break;
}
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
+ status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ &page, &fsdata);
+ if (unlikely(status < 0))
+ break;
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
- if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
- if (status != AOP_TRUNCATED_PAGE)
- unlock_page(page);
- page_cache_release(page);
- if (status == AOP_TRUNCATED_PAGE)
- continue;
- /*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
- */
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
- }
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+ copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
- status = a_ops->commit_write(file, page, offset, offset+bytes);
- if (status == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- continue;
- }
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
- filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
+
+ status = a_ops->write_end(file, mapping, pos, bytes, copied,
+ page, fsdata);
+ if (unlikely(status < 0))
break;
- balance_dirty_pages_ratelimited(mapping);
+ copied = status;
+
cond_resched();
- } while (count);
- *ppos = pos;
- if (cached_page)
- page_cache_release(cached_page);
+ iov_iter_advance(i, copied);
+ if (unlikely(copied == 0)) {
+ /*
+ * If we were unable to copy any data at all, we must
+ * fall back to a single segment length write.
+ *
+ * If we didn't fallback here, we could livelock
+ * because not all segments in the iov can be copied at
+ * once without a pagefault.
+ */
+ bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+ pos += copied;
+ written += copied;
- /*
- * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
- */
- if (likely(status >= 0)) {
- if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- if (!a_ops->writepage || !is_sync_kiocb(iocb))
- status = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
}
- }
-
- /*
- * If we get here for O_DIRECT writes then we must have fallen through
- * to buffered writes (block instantiation inside i_size). So we sync
- * the file data here, to try to honour O_DIRECT expectations.
- */
- if (unlikely(file->f_flags & O_DIRECT) && written)
- status = filemap_write_and_wait(mapping);
+ } while (iov_iter_count(i));
- pagevec_lru_add(&lru_pvec);
return written ? written : status;
}
-EXPORT_SYMBOL(generic_file_buffered_write);
+EXPORT_SYMBOL(generic_perform_write);
-static ssize_t
-__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+/**
+ * __generic_file_write_iter - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * It expects i_mutex to be grabbed unless we work on a block device or similar
+ * object which does not need locking at all.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_mutex.
+ */
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- const struct address_space * mapping = file->f_mapping;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
+ struct address_space * mapping = file->f_mapping;
struct inode *inode = mapping->host;
- unsigned long seg;
- loff_t pos;
- ssize_t written;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
ssize_t err;
-
- ocount = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *iv = &iov[seg];
-
- /*
- * If any segment has a negative length, or the cumulative
- * length ever wraps negative then return -EINVAL.
- */
- ocount += iv->iov_len;
- if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
- return -EINVAL;
- if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
- continue;
- if (seg == 0)
- return -EFAULT;
- nr_segs = seg;
- ocount -= iv->iov_len; /* This segment is no good */
- break;
- }
-
- count = ocount;
- pos = *ppos;
-
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ ssize_t status;
+ size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
- written = 0;
-
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2285,209 +2551,131 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
- err = remove_suid(file->f_dentry);
+ iov_iter_truncate(from, count);
+
+ err = file_remove_suid(file);
if (err)
goto out;
- file_update_time(file);
+ err = file_update_time(file);
+ if (err)
+ goto out;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
- written = generic_file_direct_write(iocb, iov,
- &nr_segs, pos, ppos, count, ocount);
+ loff_t endbyte;
+
+ written = generic_file_direct_write(iocb, from, pos);
if (written < 0 || written == count)
goto out;
+
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
- }
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, ppos, count, written);
+ status = generic_perform_write(file, from, pos);
+ /*
+ * If generic_perform_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (unlikely(status < 0) && !written) {
+ err = status;
+ goto out;
+ }
+ iocb->ki_pos = pos + status;
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + status - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (err == 0) {
+ written += status;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+ } else {
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ }
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
+EXPORT_SYMBOL(__generic_file_write_iter);
-ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
- loff_t pos = *ppos;
-
- ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
-
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- err = sync_page_range_nolock(inode, mapping, pos, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-
-static ssize_t
-__generic_file_write_nolock(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, file);
- ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-
-ssize_t
-generic_file_write_nolock(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, file);
- ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_write_nolock);
-
-ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
- size_t count, loff_t pos)
+/**
+ * generic_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * This is a wrapper around __generic_file_write_iter() to be used by most
+ * filesystems. It takes care of syncing the file in case of O_SYNC file
+ * and acquires i_mutex as needed.
+ */
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = file->f_mapping->host;
ssize_t ret;
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
-
- BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
- ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
- &iocb->ki_pos);
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ if (ret > 0) {
ssize_t err;
- err = sync_page_range(inode, mapping, pos, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write);
-
-ssize_t generic_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
- struct iovec local_iov = { .iov_base = (void __user *)buf,
- .iov_len = count };
-
- mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- ssize_t err;
-
- err = sync_page_range(inode, mapping, *ppos - ret, ret);
- if (err < 0)
- ret = err;
- }
- return ret;
-}
-EXPORT_SYMBOL(generic_file_write);
-
-ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- return ret;
-}
-EXPORT_SYMBOL(generic_file_readv);
-
-ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
-{
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- err = sync_page_range(inode, mapping, *ppos - ret, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
return ret;
}
-EXPORT_SYMBOL(generic_file_writev);
+EXPORT_SYMBOL(generic_file_write_iter);
-/*
- * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
- * went wrong during pagecache shootdown.
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private). If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ *
*/
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- ssize_t retval;
- size_t write_len = 0;
+ struct address_space * const mapping = page->mapping;
- /*
- * If it's a write, unmap all mmappings of the file up-front. This
- * will cause any pte dirty bits to be propagated into the pageframes
- * for the subsequent filemap_write_and_wait().
- */
- if (rw == WRITE) {
- write_len = iov_length(iov, nr_segs);
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, offset, write_len, 0);
- }
+ BUG_ON(!PageLocked(page));
+ if (PageWriteback(page))
+ return 0;
- retval = filemap_write_and_wait(mapping);
- if (retval == 0) {
- retval = mapping->a_ops->direct_IO(rw, iocb, iov,
- offset, nr_segs);
- if (rw == WRITE && mapping->nrpages) {
- pgoff_t end = (offset + write_len - 1)
- >> PAGE_CACHE_SHIFT;
- int err = invalidate_inode_pages2_range(mapping,
- offset >> PAGE_CACHE_SHIFT, end);
- if (err)
- retval = err;
- }
- }
- return retval;
+ if (mapping && mapping->a_ops->releasepage)
+ return mapping->a_ops->releasepage(page, gfp_mask);
+ return try_to_free_buffers(page);
}
+
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index 3f2a343c601..00000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * linux/mm/filemap.h
- *
- * Copyright (C) 1994-1999 Linus Torvalds
- */
-
-#ifndef __FILEMAP_H
-#define __FILEMAP_H
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/uio.h>
-#include <linux/config.h>
-#include <linux/uaccess.h>
-
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
- const struct iovec *iov,
- size_t base,
- size_t bytes);
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied. If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- *
- * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
- * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
- * and if the following non-atomic copy succeeds, then there is a small window
- * where the target page contains neither the data before the write, nor the
- * data after the write (it contains zero). A read at this time will see
- * data that is inconsistent with any ordering of the read and the write.
- * (This has been detected in practice).
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
- const char __user *buf, unsigned bytes)
-{
- char *kaddr;
- int left;
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
-
- if (left != 0) {
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
- kunmap(page);
- }
- return bytes - left;
-}
-
-/*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap_atomic(page, KM_USER0);
- copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
- base, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- if (copied != bytes) {
- kaddr = kmap(page);
- copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
- base, bytes);
- if (bytes - copied)
- memset(kaddr + offset + copied, 0, bytes - copied);
- kunmap(page);
- }
- return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t base = *basep;
-
- do {
- int copy = min(bytes, iov->iov_len - base);
-
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- base = 0;
- }
- } while (bytes);
- *iovp = iov;
- *basep = base;
-}
-#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b4fd0d7c9bf..d8d9fe3f685 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,44 +10,74 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
#include <asm/tlbflush.h>
-#include "filemap.h"
+#include <asm/io.h>
+
+/*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
+static struct page *__xip_sparse_page;
+
+/* called under xip_sparse_mutex */
+static struct page *xip_sparse_page(void)
+{
+ if (!__xip_sparse_page) {
+ struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+ if (page)
+ __xip_sparse_page = page;
+ }
+ return __xip_sparse_page;
+}
/*
* This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_page() function for the actual low-level
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
* stuff.
*
* Note the struct file* is not used at all. It may be NULL.
*/
-static void
+static ssize_t
do_xip_mapping_read(struct address_space *mapping,
struct file_ra_state *_ra,
struct file *filp,
- loff_t *ppos,
- read_descriptor_t *desc,
- read_actor_t actor)
+ char __user *buf,
+ size_t len,
+ loff_t *ppos)
{
struct inode *inode = mapping->host;
- unsigned long index, end_index, offset;
- loff_t isize;
+ pgoff_t index, end_index;
+ unsigned long offset;
+ loff_t isize, pos;
+ size_t copied = 0, error = 0;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
- index = *ppos >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ pos = *ppos;
+ index = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_CACHE_MASK;
isize = i_size_read(inode);
if (!isize)
goto out;
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- for (;;) {
- struct page *page;
- unsigned long nr, ret;
+ do {
+ unsigned long nr, left;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int zero = 0;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
@@ -60,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping,
}
}
nr = nr - offset;
+ if (nr > len - copied)
+ nr = len - copied;
- page = mapping->a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (!page)
- goto no_xip_page;
- if (unlikely(IS_ERR(page))) {
- if (PTR_ERR(page) == -ENODATA) {
+ error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(error)) {
+ if (error == -ENODATA) {
/* sparse */
- page = ZERO_PAGE(0);
- } else {
- desc->error = PTR_ERR(page);
+ zero = 1;
+ } else
goto out;
- }
}
/* If users can be writing to this page using arbitrary
@@ -80,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping,
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ /* address based flush */ ;
/*
- * Ok, we have the page, so now we can copy it to user space...
+ * Ok, we have the mem, so now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
@@ -91,78 +119,47 @@ do_xip_mapping_read(struct address_space *mapping,
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
- offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ if (!zero)
+ left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+ else
+ left = __clear_user(buf + copied, nr);
- if (ret == nr && desc->count)
- continue;
- goto out;
+ if (left) {
+ error = -EFAULT;
+ goto out;
+ }
-no_xip_page:
- /* Did not get the page. Report it */
- desc->error = -EIO;
- goto out;
- }
+ copied += (nr - left);
+ offset += (nr - left);
+ index += offset >> PAGE_CACHE_SHIFT;
+ offset &= ~PAGE_CACHE_MASK;
+ } while (copied < len);
out:
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = pos + copied;
if (filp)
file_accessed(filp);
+
+ return (copied ? copied : error);
}
ssize_t
xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
- read_descriptor_t desc;
-
if (!access_ok(VERIFY_WRITE, buf, len))
return -EFAULT;
- desc.written = 0;
- desc.arg.buf = buf;
- desc.count = len;
- desc.error = 0;
-
- do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
- ppos, &desc, file_read_actor);
-
- if (desc.written)
- return desc.written;
- else
- return desc.error;
+ return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+ buf, len, ppos);
}
EXPORT_SYMBOL_GPL(xip_file_read);
-ssize_t
-xip_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
- ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-EXPORT_SYMBOL_GPL(xip_file_sendfile);
-
/*
* __xip_unmap is invoked from xip_unmap and
* xip_write
*
* This function walks all vmas of the address_space and unmaps the
- * ZERO_PAGE when found at pgoff. Should it go in rmap.c?
+ * __xip_sparse_page when found at pgoff.
*/
static void
__xip_unmap (struct address_space * mapping,
@@ -170,100 +167,155 @@ __xip_unmap (struct address_space * mapping,
{
struct vm_area_struct *vma;
struct mm_struct *mm;
- struct prio_tree_iter iter;
unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
struct page *page;
+ unsigned count;
+ int locked = 0;
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ count = read_seqcount_begin(&xip_sparse_seq);
+
+ page = __xip_sparse_page;
+ if (!page)
+ return;
+
+retry:
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- page = ZERO_PAGE(address);
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
page_remove_rmap(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
+ /* must invalidate_page _before_ freeing the page */
+ mmu_notifier_invalidate_page(mm, address);
page_cache_release(page);
}
}
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (locked) {
+ mutex_unlock(&xip_sparse_mutex);
+ } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+ mutex_lock(&xip_sparse_mutex);
+ locked = 1;
+ goto retry;
+ }
}
/*
- * xip_nopage() is invoked via the vma operations vector for a
+ * xip_fault() is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault.
*
- * This function is derived from filemap_nopage, but used for execute in place
+ * This function is derived from filemap_fault, but used for execute in place
*/
-static struct page *
-xip_file_nopage(struct vm_area_struct * area,
- unsigned long address,
- int *type)
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct file *file = area->vm_file;
+ struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ pgoff_t size;
+ void *xip_mem;
+ unsigned long xip_pfn;
struct page *page;
- unsigned long size, pgoff, endoff;
-
- pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
- + area->vm_pgoff;
- endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
- + area->vm_pgoff;
+ int error;
+ /* XXX: are VM_FAULT_ codes OK? */
+again:
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (pgoff >= size) {
- return NULL;
- }
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
- page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
- if (!IS_ERR(page)) {
- goto out;
- }
- if (PTR_ERR(page) != -ENODATA)
- return NULL;
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (likely(!error))
+ goto found;
+ if (error != -ENODATA)
+ return VM_FAULT_OOM;
/* sparse block */
- if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
- (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
+ if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+ (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
(!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+ int err;
+
/* maybe shared writable, allocate new block */
- page = mapping->a_ops->get_xip_page (mapping,
- pgoff*(PAGE_SIZE/512), 1);
- if (IS_ERR(page))
- return NULL;
- /* unmap page at pgoff from all other vmas */
- __xip_unmap(mapping, pgoff);
+ mutex_lock(&xip_sparse_mutex);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (error)
+ return VM_FAULT_SIGBUS;
+ /* unmap sparse mappings at pgoff from all other vmas */
+ __xip_unmap(mapping, vmf->pgoff);
+
+found:
+ err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+ xip_pfn);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ /*
+ * err == -EBUSY is fine, we've raced against another thread
+ * that faulted-in the same page
+ */
+ if (err != -EBUSY)
+ BUG_ON(err);
+ return VM_FAULT_NOPAGE;
} else {
- /* not shared and writable, use ZERO_PAGE() */
- page = ZERO_PAGE(address);
- }
+ int err, ret = VM_FAULT_OOM;
+
+ mutex_lock(&xip_sparse_mutex);
+ write_seqcount_begin(&xip_sparse_seq);
+ error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(!error)) {
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+ goto again;
+ }
+ if (error != -ENODATA)
+ goto out;
+ /* not shared and writable, use xip_sparse_page() */
+ page = xip_sparse_page();
+ if (!page)
+ goto out;
+ err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+ page);
+ if (err == -ENOMEM)
+ goto out;
+ ret = VM_FAULT_NOPAGE;
out:
- page_cache_get(page);
- return page;
+ write_seqcount_end(&xip_sparse_seq);
+ mutex_unlock(&xip_sparse_mutex);
+
+ return ret;
+ }
}
-static struct vm_operations_struct xip_file_vm_ops = {
- .nopage = xip_file_nopage,
+static const struct vm_operations_struct xip_file_vm_ops = {
+ .fault = xip_file_fault,
+ .page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
- BUG_ON(!file->f_mapping->a_ops->get_xip_page);
+ BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
@@ -276,16 +328,17 @@ __xip_file_write(struct file *filp, const char __user *buf,
const struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
- struct page *page;
size_t bytes;
ssize_t written = 0;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
do {
unsigned long index;
unsigned long offset;
size_t copied;
+ void *xip_mem;
+ unsigned long xip_pfn;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
index = pos >> PAGE_CACHE_SHIFT;
@@ -293,32 +346,25 @@ __xip_file_write(struct file *filp, const char __user *buf,
if (bytes > count)
bytes = count;
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- fault_in_pages_readable(buf, bytes);
-
- page = a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
+ status = a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (status == -ENODATA) {
/* we allocate a new page unmap it */
- page = a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 1);
- if (!IS_ERR(page))
+ mutex_lock(&xip_sparse_mutex);
+ status = a_ops->get_xip_mem(mapping, index, 1,
+ &xip_mem, &xip_pfn);
+ mutex_unlock(&xip_sparse_mutex);
+ if (!status)
/* unmap page at pgoff from all other vmas */
__xip_unmap(mapping, index);
}
- if (IS_ERR(page)) {
- status = PTR_ERR(page);
+ if (status)
break;
- }
- copied = filemap_copy_from_user(page, offset, buf, bytes);
- flush_dcache_page(page);
+ copied = bytes -
+ __copy_from_user_nocache(xip_mem + offset, buf, bytes);
+
if (likely(copied > 0)) {
status = copied;
@@ -368,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
pos = *ppos;
count = len;
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -379,11 +423,13 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (count == 0)
goto out_backing;
- ret = remove_suid(filp->f_dentry);
+ ret = file_remove_suid(filp);
if (ret)
goto out_backing;
- file_update_time(filp);
+ ret = file_update_time(filp);
+ if (ret)
+ goto out_backing;
ret = __xip_file_write (filp, buf, count, pos, ppos);
@@ -397,7 +443,7 @@ EXPORT_SYMBOL_GPL(xip_file_write);
/*
* truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_page
+ * functionality is analog to block_truncate_page but does use get_xip_mem
* to get the page instead of page cache
*/
int
@@ -407,10 +453,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize;
unsigned length;
- struct page *page;
- void *kaddr;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int err;
- BUG_ON(!mapping->a_ops->get_xip_page);
+ BUG_ON(!mapping->a_ops->get_xip_mem);
blocksize = 1 << mapping->host->i_blkbits;
length = offset & (blocksize - 1);
@@ -421,22 +468,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
length = blocksize - length;
- page = mapping->a_ops->get_xip_page(mapping,
- index*(PAGE_SIZE/512), 0);
- if (!page)
- return -ENOMEM;
- if (unlikely(IS_ERR(page))) {
- if (PTR_ERR(page) == -ENODATA)
+ err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+ &xip_mem, &xip_pfn);
+ if (unlikely(err)) {
+ if (err == -ENODATA)
/* Hole? No need to truncate */
return 0;
else
- return PTR_ERR(page);
+ return err;
}
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr + offset, 0, length);
- kunmap_atomic(kaddr, KM_USER0);
-
- flush_dcache_page(page);
+ memset(xip_mem + offset, 0, length);
return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index aa30618ec6b..72b8fa36143 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,8 @@
*
* started by Ingo Molnar, Copyright (C) 2002, 2003
*/
-
+#include <linux/export.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -13,18 +14,26 @@
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
-#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+#include "internal.h"
+
+static int mm_counter(struct page *page)
+{
+ return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
+}
+
+static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
- struct page *page = NULL;
+ struct page *page;
+ swp_entry_t entry;
if (pte_present(pte)) {
flush_cache_page(vma, addr, pte_pfn(pte));
@@ -33,88 +42,50 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, mm_counter(page));
page_remove_rmap(page);
page_cache_release(page);
}
- } else {
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
- pte_clear(mm, addr, ptep);
+ } else { /* zap_pte() is not called when pte_none() */
+ if (!pte_file(pte)) {
+ update_hiwater_rss(mm);
+ entry = pte_to_swp_entry(pte);
+ if (non_swap_entry(entry)) {
+ if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+ dec_mm_counter(mm, mm_counter(page));
+ }
+ } else {
+ free_swap_and_cache(entry);
+ dec_mm_counter(mm, MM_SWAPENTS);
+ }
+ }
+ pte_clear_not_present_full(mm, addr, ptep, 0);
}
- return !!page;
}
/*
- * Install a file page to a given virtual memory address, release any
- * previously existing mapping.
- */
-int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, struct page *page, pgprot_t prot)
-{
- struct inode *inode;
- pgoff_t size;
- int err = -ENOMEM;
- pte_t *pte;
- pte_t pte_val;
- spinlock_t *ptl;
-
- pte = get_locked_pte(mm, addr, &ptl);
- if (!pte)
- goto out;
-
- /*
- * This page may have been truncated. Tell the
- * caller about it.
- */
- err = -EINVAL;
- inode = vma->vm_file->f_mapping->host;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (!page->mapping || page->index >= size)
- goto unlock;
- err = -ENOMEM;
- if (page_mapcount(page) > INT_MAX/2)
- goto unlock;
-
- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
- inc_mm_counter(mm, file_rss);
-
- flush_icache_page(vma, page);
- pte_val = mk_pte(page, prot);
- set_pte_at(mm, addr, pte, pte_val);
- page_add_file_rmap(page);
- update_mmu_cache(vma, addr, pte_val);
- lazy_mmu_prot_update(pte_val);
- err = 0;
-unlock:
- pte_unmap_unlock(pte, ptl);
-out:
- return err;
-}
-EXPORT_SYMBOL(install_page);
-
-/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
*/
-int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
int err = -ENOMEM;
- pte_t *pte;
- pte_t pte_val;
+ pte_t *pte, ptfile;
spinlock_t *ptl;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
- if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
- update_hiwater_rss(mm);
- dec_mm_counter(mm, file_rss);
- }
+ ptfile = pgoff_to_pte(pgoff);
+
+ if (!pte_none(*pte))
+ zap_pte(mm, vma, addr, pte);
- set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
- pte_val = *pte;
+ set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
/*
* We don't need to run update_mmu_cache() here because the "file pte"
* being installed by install_file_pte() is not a real pte - it's a
@@ -128,35 +99,61 @@ out:
return err;
}
-/***
- * sys_remap_file_pages - remap arbitrary pages of a shared backing store
- * file within an existing vma.
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ do {
+ err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
+ if (err)
+ return err;
+
+ size -= PAGE_SIZE;
+ addr += PAGE_SIZE;
+ pgoff++;
+ } while (size);
+
+ return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
+/**
+ * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
* @start: start of the remapped virtual memory range
* @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range
- * @pgoff: to be mapped page of the backing store file
+ * @prot: new protection bits of the range (see NOTE)
+ * @pgoff: to-be-mapped page of the backing store file
* @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
*
- * this syscall works purely via pagetables, so it's the most efficient
+ * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
+ * (shared backing store file).
+ *
+ * This syscall works purely via pagetables, so it's the most efficient
* way to map the same (large) file into a given virtual window. Unlike
* mmap()/mremap() it does not create any new vmas. The new mappings are
* also safe across swapout.
*
- * NOTE: the 'prot' parameter right now is ignored, and the vma's default
- * protection is used. Arbitrary protections might be implemented in the
- * future.
+ * NOTE: the @prot parameter right now is ignored (but must be zero),
+ * and the vma's default protection is used. Arbitrary protections
+ * might be implemented in the future.
*/
-asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
- unsigned long __prot, unsigned long pgoff, unsigned long flags)
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
- unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
+ vm_flags_t vm_flags = 0;
- if (__prot)
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+ "See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
+
+ if (prot)
return err;
/*
* Sanitize the syscall parameters:
@@ -168,6 +165,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
if (start + size <= start)
return err;
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return err;
+
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -182,49 +183,101 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
/*
* Make sure the vma is shared, that it supports prefaulting,
* and that the remapped range is valid and fully within
- * the single existing vma. vm_private_data is used as a
- * swapout cursor in a VM_NONLINEAR vma.
+ * the single existing vma.
*/
- if (vma && (vma->vm_flags & VM_SHARED) &&
- (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
- vma->vm_ops && vma->vm_ops->populate &&
- end > start && start >= vma->vm_start &&
- end <= vma->vm_end) {
-
- /* Must set VM_NONLINEAR before any pages are populated. */
- if (pgoff != linear_page_index(vma, start) &&
- !(vma->vm_flags & VM_NONLINEAR)) {
- if (!has_write_lock) {
- up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
- has_write_lock = 1;
- goto retry;
- }
- mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
- flush_dcache_mmap_lock(mapping);
- vma->vm_flags |= VM_NONLINEAR;
- vma_prio_tree_remove(vma, &mapping->i_mmap);
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (!vma->vm_ops || !vma->vm_ops->remap_pages)
+ goto out;
+
+ if (start < vma->vm_start || start + size > vma->vm_end)
+ goto out;
+
+ /* Must set VM_NONLINEAR before any pages are populated. */
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
+ /*
+ * vm_private_data is used as a swapout cursor
+ * in a VM_NONLINEAR vma.
+ */
+ if (vma->vm_private_data)
+ goto out;
+
+ /* Don't need a nonlinear mapping, exit success */
+ if (pgoff == linear_page_index(vma, start)) {
+ err = 0;
+ goto out;
+ }
+
+ if (!has_write_lock) {
+get_write_lock:
+ up_read(&mm->mmap_sem);
+ down_write(&mm->mmap_sem);
+ has_write_lock = 1;
+ goto retry;
}
+ mapping = vma->vm_file->f_mapping;
+ /*
+ * page_mkclean doesn't work on nonlinear vmas, so if
+ * dirty pages need to be accounted, emulate with linear
+ * vmas.
+ */
+ if (mapping_cap_account_dirty(mapping)) {
+ unsigned long addr;
+ struct file *file = get_file(vma->vm_file);
+ /* mmap_region may free vma; grab the info now */
+ vm_flags = vma->vm_flags;
- err = vma->vm_ops->populate(vma, start, size,
- vma->vm_page_prot,
- pgoff, flags & MAP_NONBLOCK);
+ addr = mmap_region(file, start, size, vm_flags, pgoff);
+ fput(file);
+ if (IS_ERR_VALUE(addr)) {
+ err = addr;
+ } else {
+ BUG_ON(addr != start);
+ err = 0;
+ }
+ goto out_freed;
+ }
+ mutex_lock(&mapping->i_mmap_mutex);
+ flush_dcache_mmap_lock(mapping);
+ vma->vm_flags |= VM_NONLINEAR;
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+ flush_dcache_mmap_unlock(mapping);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ }
+ if (vma->vm_flags & VM_LOCKED) {
/*
- * We can't clear VM_NONLINEAR because we'd have to do
- * it after ->populate completes, and that would prevent
- * downgrading the lock. (Locks can't be upgraded).
+ * drop PG_Mlocked flag for over-mapped range
*/
+ if (!has_write_lock)
+ goto get_write_lock;
+ vm_flags = vma->vm_flags;
+ munlock_vma_pages_range(vma, start, start + size);
+ vma->vm_flags = vm_flags;
}
+
+ mmu_notifier_invalidate_range_start(mm, start, start + size);
+ err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
+ mmu_notifier_invalidate_range_end(mm, start, start + size);
+
+ /*
+ * We can't clear VM_NONLINEAR because we'd have to do
+ * it after ->populate completes, and that would prevent
+ * downgrading the lock. (Locks can't be upgraded).
+ */
+
+out:
+ if (vma)
+ vm_flags = vma->vm_flags;
+out_freed:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
up_write(&mm->mmap_sem);
+ if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
+ mm_populate(start, size);
return err;
}
-
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 00000000000..c30eec536f0
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,455 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops *frontswap_ops __read_mostly;
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increased control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load, on a successful tmem_get must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+
+/*
+ * Due to the asynchronous nature of the backends loading potentially
+ * _after_ the swap system has been activated, we have chokepoints
+ * on all frontswap functions to not call the backend until the backend
+ * has registered.
+ *
+ * Specifically when no backend is registered (nobody called
+ * frontswap_register_ops) all calls to frontswap_init (which is done via
+ * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
+ * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
+ * backend registers with frontswap at some later point the previous
+ * calls to frontswap_init are executed (by iterating over the need_init
+ * bitmap) to create tmem_pools and set the respective poolids. All of that is
+ * guarded by us using atomic bit operations on the 'need_init' bitmap.
+ *
+ * This would not guard us against the user deciding to call swapoff right as
+ * we are calling the backend to initialize (so swapon is in action).
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
+ * OK. The other scenario where calls to frontswap_store (called via
+ * swap_writepage) is racing with frontswap_invalidate_area (called via
+ * swapoff) is again guarded by the swap subsystem.
+ *
+ * While no backend is registered all calls to frontswap_[store|load|
+ * invalidate_area|invalidate_page] are ignored or fail.
+ *
+ * The time between the backend being registered and the swap file system
+ * calling the backend (via the frontswap_* functions) is indeterminate as
+ * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
+ * That is OK as we are comfortable missing some of these calls to the newly
+ * registered backend.
+ *
+ * Obviously the opposite (unloading the backend) must be done after all
+ * the frontswap_[store|load|invalidate_area|invalidate_page] start
+ * ignoring or failing the requests - at which point frontswap_ops
+ * would have to be made in some fashion atomic.
+ */
+static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
+
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops *old = frontswap_ops;
+ int i;
+
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ if (test_and_clear_bit(i, need_init)) {
+ struct swap_info_struct *sis = swap_info[i];
+ /* __frontswap_init _should_ have set it! */
+ if (!sis->frontswap_map)
+ return ERR_PTR(-EINVAL);
+ ops->init(i);
+ }
+ }
+ /*
+ * We MUST have frontswap_ops set _after_ the frontswap_init's
+ * have been called. Otherwise __frontswap_store might fail. Hence
+ * the barrier to make sure compiler does not re-order us.
+ */
+ barrier();
+ frontswap_ops = ops;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+ frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type, unsigned long *map)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+
+ /*
+ * p->frontswap is a bitmap that we MUST have to figure out which page
+ * has gone in frontswap. Without it there is no point of continuing.
+ */
+ if (WARN_ON(!map))
+ return;
+ /*
+ * Regardless of whether the frontswap backend has been loaded
+ * before this function or it will be later, we _MUST_ have the
+ * p->frontswap set to something valid to work properly.
+ */
+ frontswap_map_set(sis, map);
+ if (frontswap_ops)
+ frontswap_ops->init(type);
+ else {
+ BUG_ON(type > MAX_SWAPFILES);
+ set_bit(type, need_init);
+ }
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+bool __frontswap_test(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ bool ret = false;
+
+ if (frontswap_ops && sis->frontswap_map)
+ ret = test_bit(offset, sis->frontswap_map);
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_test);
+
+static inline void __frontswap_clear(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ clear_bit(offset, sis->frontswap_map);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ /*
+ * Return if no backend is registered.
+ * Don't need to inc frontswap_failed_stores here.
+ */
+ if (!frontswap_ops)
+ return ret;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (__frontswap_test(sis, offset))
+ dup = 1;
+ ret = frontswap_ops->store(type, offset, page);
+ if (ret == 0) {
+ set_bit(offset, sis->frontswap_map);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ inc_frontswap_failed_stores();
+ if (dup)
+ __frontswap_clear(sis, offset);
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ /*
+ * __frontswap_test() will check whether there is backend registered
+ */
+ if (__frontswap_test(sis, offset))
+ ret = frontswap_ops->load(type, offset, page);
+ if (ret == 0) {
+ inc_frontswap_loads();
+ if (frontswap_tmem_exclusive_gets_enabled) {
+ SetPageDirty(page);
+ __frontswap_clear(sis, offset);
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ /*
+ * __frontswap_test() will check whether there is backend registered
+ */
+ if (__frontswap_test(sis, offset)) {
+ frontswap_ops->invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ if (frontswap_ops) {
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops->invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ bitmap_zero(sis->frontswap_map, sis->max);
+ }
+ clear_bit(type, need_init);
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list)
+ totalpages += atomic_read(&si->frontswap_pages);
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+
+ assert_spin_locked(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = si->type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Used to check if it's necessary and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shrink pages,
+ * error code when there is an error.
+ */
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 1;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int uninitialized_var(type), ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_active_head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/gup.c b/mm/gup.c
new file mode 100644
index 00000000000..cc5a9e7adea
--- /dev/null
+++ b/mm/gup.c
@@ -0,0 +1,662 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+static struct page *no_page_table(struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ /*
+ * When core dumping an enormous anonymous area that nobody
+ * has touched so far, we don't want to allocate unnecessary pages or
+ * page tables. Return error instead of NULL to skip handle_mm_fault,
+ * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely
+ * be zero-filled if handle_mm_fault() actually did handle it.
+ */
+ if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+ return ERR_PTR(-EFAULT);
+ return NULL;
+}
+
+static struct page *follow_page_pte(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+
+retry:
+ if (unlikely(pmd_bad(*pmd)))
+ return no_page_table(vma, flags);
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = *ptep;
+ if (!pte_present(pte)) {
+ swp_entry_t entry;
+ /*
+ * KSM's break_ksm() relies upon recognizing a ksm page
+ * even while it is being migrated, so for that case we
+ * need migration_entry_wait().
+ */
+ if (likely(!(flags & FOLL_MIGRATION)))
+ goto no_page;
+ if (pte_none(pte) || pte_file(pte))
+ goto no_page;
+ entry = pte_to_swp_entry(pte);
+ if (!is_migration_entry(entry))
+ goto no_page;
+ pte_unmap_unlock(ptep, ptl);
+ migration_entry_wait(mm, pmd, address);
+ goto retry;
+ }
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
+ if ((flags & FOLL_WRITE) && !pte_write(pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ return NULL;
+ }
+
+ page = vm_normal_page(vma, address, pte);
+ if (unlikely(!page)) {
+ if ((flags & FOLL_DUMP) ||
+ !is_zero_pfn(pte_pfn(pte)))
+ goto bad_page;
+ page = pte_page(pte);
+ }
+
+ if (flags & FOLL_GET)
+ get_page_foll(page);
+ if (flags & FOLL_TOUCH) {
+ if ((flags & FOLL_WRITE) &&
+ !pte_dirty(pte) && !PageDirty(page))
+ set_page_dirty(page);
+ /*
+ * pte_mkyoung() would be more correct here, but atomic care
+ * is needed to avoid losing the dirty bit: it is easier to use
+ * mark_page_accessed().
+ */
+ mark_page_accessed(page);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+ pte_unmap_unlock(ptep, ptl);
+ return page;
+bad_page:
+ pte_unmap_unlock(ptep, ptl);
+ return ERR_PTR(-EFAULT);
+
+no_page:
+ pte_unmap_unlock(ptep, ptl);
+ if (!pte_none(pte))
+ return NULL;
+ return no_page_table(vma, flags);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+
+ *page_mask = 0;
+
+ page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+ if (!IS_ERR(page)) {
+ BUG_ON(flags & FOLL_GET);
+ return page;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ return no_page_table(vma, flags);
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ return no_page_table(vma, flags);
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ if (flags & FOLL_GET)
+ return NULL;
+ page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+ return page;
+ }
+ if (unlikely(pud_bad(*pud)))
+ return no_page_table(vma, flags);
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return no_page_table(vma, flags);
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+ if (flags & FOLL_GET) {
+ /*
+ * Refcount on tail pages are not well-defined and
+ * shouldn't be taken. The caller should handle a NULL
+ * return when trying to follow tail pages.
+ */
+ if (PageHead(page))
+ get_page(page);
+ else
+ page = NULL;
+ }
+ return page;
+ }
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ return no_page_table(vma, flags);
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(vma, address, pmd);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ ptl = pmd_lock(mm, pmd);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(ptl);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(vma, address,
+ pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
+ }
+ } else
+ spin_unlock(ptl);
+ }
+ return follow_page_pte(vma, address, pmd, flags);
+}
+
+static int get_gate_page(struct mm_struct *mm, unsigned long address,
+ unsigned int gup_flags, struct vm_area_struct **vma,
+ struct page **page)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int ret = -EFAULT;
+
+ /* user gate pages are read-only */
+ if (gup_flags & FOLL_WRITE)
+ return -EFAULT;
+ if (address > TASK_SIZE)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = pgd_offset_gate(mm, address);
+ BUG_ON(pgd_none(*pgd));
+ pud = pud_offset(pgd, address);
+ BUG_ON(pud_none(*pud));
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ pte = pte_offset_map(pmd, address);
+ if (pte_none(*pte))
+ goto unmap;
+ *vma = get_gate_vma(mm);
+ if (!page)
+ goto out;
+ *page = vm_normal_page(*vma, address, *pte);
+ if (!*page) {
+ if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+ goto unmap;
+ *page = pte_page(*pte);
+ }
+ get_page(*page);
+out:
+ ret = 0;
+unmap:
+ pte_unmap(pte);
+ return ret;
+}
+
+static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int fault_flags = 0;
+ int ret;
+
+ /* For mlock, just skip the stack guard page. */
+ if ((*flags & FOLL_MLOCK) &&
+ (stack_guard_page_start(vma, address) ||
+ stack_guard_page_end(vma, address + PAGE_SIZE)))
+ return -ENOENT;
+ if (*flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (*flags & FOLL_NOWAIT)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ return -EBUSY;
+ }
+
+ /*
+ * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
+ * necessary, even if maybe_mkwrite decided not to set pte_write. We
+ * can thus safely do subsequent page lookups as if they were reads.
+ * But only do so when looping for pte_write is futile: in some cases
+ * userspace may also be wanting to write to the gotten user page,
+ * which a read fault here might prevent (a readonly page might get
+ * reCOWed by userspace write).
+ */
+ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
+ *flags &= ~FOLL_WRITE;
+ return 0;
+}
+
+static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
+{
+ vm_flags_t vm_flags = vma->vm_flags;
+
+ if (vm_flags & (VM_IO | VM_PFNMAP))
+ return -EFAULT;
+
+ if (gup_flags & FOLL_WRITE) {
+ if (!(vm_flags & VM_WRITE)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * We used to let the write,force case do COW in a
+ * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
+ * set a breakpoint in a read-only mapping of an
+ * executable, without corrupting the file (yet only
+ * when that file had been opened for writing!).
+ * Anon pages in shared mappings are surprising: now
+ * just reject it.
+ */
+ if (!is_cow_mapping(vm_flags)) {
+ WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+ return -EFAULT;
+ }
+ }
+ } else if (!(vm_flags & VM_READ)) {
+ if (!(gup_flags & FOLL_FORCE))
+ return -EFAULT;
+ /*
+ * Is there actually any vma we can reach here which does not
+ * have VM_MAYREAD set?
+ */
+ if (!(vm_flags & VM_MAYREAD))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0, returns 0 immediately. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *nonblocking)
+{
+ long i = 0;
+ unsigned int page_mask;
+ struct vm_area_struct *vma = NULL;
+
+ if (!nr_pages)
+ return 0;
+
+ VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
+ /*
+ * If FOLL_FORCE is set then do not force a full fault as the hinting
+ * fault information is unrelated to the reference behaviour of a task
+ * using the address space
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
+ do {
+ struct page *page;
+ unsigned int foll_flags = gup_flags;
+ unsigned int page_increm;
+
+ /* first iteration or cross vma bound */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_extend_vma(mm, start);
+ if (!vma && in_gate_area(mm, start)) {
+ int ret;
+ ret = get_gate_page(mm, start & PAGE_MASK,
+ gup_flags, &vma,
+ pages ? &pages[i] : NULL);
+ if (ret)
+ return i ? : ret;
+ page_mask = 0;
+ goto next_page;
+ }
+
+ if (!vma || check_vma_flags(vma, gup_flags))
+ return i ? : -EFAULT;
+ if (is_vm_hugetlb_page(vma)) {
+ i = follow_hugetlb_page(mm, vma, pages, vmas,
+ &start, &nr_pages, i,
+ gup_flags);
+ continue;
+ }
+ }
+retry:
+ /*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current)))
+ return i ? i : -ERESTARTSYS;
+ cond_resched();
+ page = follow_page_mask(vma, start, foll_flags, &page_mask);
+ if (!page) {
+ int ret;
+ ret = faultin_page(tsk, vma, start, &foll_flags,
+ nonblocking);
+ switch (ret) {
+ case 0:
+ goto retry;
+ case -EFAULT:
+ case -ENOMEM:
+ case -EHWPOISON:
+ return i ? i : ret;
+ case -EBUSY:
+ return i;
+ case -ENOENT:
+ goto next_page;
+ }
+ BUG();
+ }
+ if (IS_ERR(page))
+ return i ? i : PTR_ERR(page);
+ if (pages) {
+ pages[i] = page;
+ flush_anon_page(vma, page, start);
+ flush_dcache_page(page);
+ page_mask = 0;
+ }
+next_page:
+ if (vmas) {
+ vmas[i] = vma;
+ page_mask = 0;
+ }
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ if (page_increm > nr_pages)
+ page_increm = nr_pages;
+ i += page_increm;
+ start += page_increm * PAGE_SIZE;
+ nr_pages -= page_increm;
+ } while (nr_pages);
+ return i;
+}
+EXPORT_SYMBOL(__get_user_pages);
+
+/**
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * get_user_pages() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mmap_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ vm_flags_t vm_flags;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+ if (!(vm_flags & vma->vm_flags))
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/**
+ * get_user_pages() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to by the caller
+ * @force: whether to force access even when user mapping is currently
+ * protected (but never forces write access to shared mapping).
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0, returns 0 immediately. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If write=0, the page must not be written to. If the page is written to,
+ * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
+ * after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ */
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages, int write,
+ int force, struct page **pages, struct vm_area_struct **vmas)
+{
+ int flags = FOLL_TOUCH;
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by page_cache_release() or put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_sem, but after all other threads have been killed.
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+
+ if (__get_user_pages(current, current->mm, addr, 1,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
+ return NULL;
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ return page;
+}
+#endif /* CONFIG_ELF_CORE */
diff --git a/mm/highmem.c b/mm/highmem.c
index ee5519b176e..b32b70cdaed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
*/
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
@@ -26,15 +26,13 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/kgdb.h>
#include <asm/tlbflush.h>
-static mempool_t *page_pool, *isa_page_pool;
-static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
-{
- return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
/*
* Virtual_count is not a pure "count".
@@ -47,14 +45,24 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
#ifdef CONFIG_HIGHMEM
unsigned long totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(totalhigh_pages);
+
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
unsigned int pages = 0;
- for_each_online_pgdat(pgdat)
- pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ for_each_online_pgdat(pgdat) {
+ pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+ NR_FREE_PAGES);
+ if (zone_movable_is_highmem())
+ pages += zone_page_state(
+ &pgdat->node_zones[ZONE_MOVABLE],
+ NR_FREE_PAGES);
+ }
return pages;
}
@@ -67,9 +75,42 @@ pte_t * pkmap_page_table;
static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
+/*
+ * Most architectures have no use for kmap_high_get(), so let's abstract
+ * the disabling of IRQ out of the locking in that case to save on a
+ * potential useless overhead.
+ */
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+#define lock_kmap() spin_lock_irq(&kmap_lock)
+#define unlock_kmap() spin_unlock_irq(&kmap_lock)
+#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
+#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
+#else
+#define lock_kmap() spin_lock(&kmap_lock)
+#define unlock_kmap() spin_unlock(&kmap_lock)
+#define lock_kmap_any(flags) \
+ do { spin_lock(&kmap_lock); (void)(flags); } while (0)
+#define unlock_kmap_any(flags) \
+ do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
+#endif
+
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
+ int i = PKMAP_NR(addr);
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+EXPORT_SYMBOL(kmap_to_page);
+
static void flush_all_zero_pkmaps(void)
{
int i;
+ int need_flush = 0;
flush_cache_kmaps();
@@ -97,12 +138,23 @@ static void flush_all_zero_pkmaps(void)
* So no dangers, even with speculative execution.
*/
page = pte_page(pkmap_page_table[i]);
- pte_clear(&init_mm, (unsigned long)page_address(page),
- &pkmap_page_table[i]);
+ pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
+ need_flush = 1;
}
- flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+ if (need_flush)
+ flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+}
+
+/**
+ * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
+ */
+void kmap_flush_unused(void)
+{
+ lock_kmap();
+ flush_all_zero_pkmaps();
+ unlock_kmap();
}
static inline unsigned long map_new_virtual(struct page *page)
@@ -132,10 +184,10 @@ start:
__set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&pkmap_map_wait, &wait);
- spin_unlock(&kmap_lock);
+ unlock_kmap();
schedule();
remove_wait_queue(&pkmap_map_wait, &wait);
- spin_lock(&kmap_lock);
+ lock_kmap();
/* Somebody else might have mapped it while we slept */
if (page_address(page))
@@ -155,35 +207,75 @@ start:
return vaddr;
}
-void fastcall *kmap_high(struct page *page)
+/**
+ * kmap_high - map a highmem page into memory
+ * @page: &struct page to map
+ *
+ * Returns the page's virtual memory address.
+ *
+ * We cannot call this from interrupts, as it may block.
+ */
+void *kmap_high(struct page *page)
{
unsigned long vaddr;
/*
* For highmem pages, we can't trust "virtual" until
* after we have the lock.
- *
- * We cannot call this from interrupts, as it may block
*/
- spin_lock(&kmap_lock);
+ lock_kmap();
vaddr = (unsigned long)page_address(page);
if (!vaddr)
vaddr = map_new_virtual(page);
pkmap_count[PKMAP_NR(vaddr)]++;
BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
- spin_unlock(&kmap_lock);
+ unlock_kmap();
return (void*) vaddr;
}
EXPORT_SYMBOL(kmap_high);
-void fastcall kunmap_high(struct page *page)
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+/**
+ * kmap_high_get - pin a highmem page into memory
+ * @page: &struct page to pin
+ *
+ * Returns the page's current virtual memory address, or NULL if no mapping
+ * exists. If and only if a non null address is returned then a
+ * matching call to kunmap_high() is necessary.
+ *
+ * This can be called from any context.
+ */
+void *kmap_high_get(struct page *page)
+{
+ unsigned long vaddr, flags;
+
+ lock_kmap_any(flags);
+ vaddr = (unsigned long)page_address(page);
+ if (vaddr) {
+ BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1);
+ pkmap_count[PKMAP_NR(vaddr)]++;
+ }
+ unlock_kmap_any(flags);
+ return (void*) vaddr;
+}
+#endif
+
+/**
+ * kunmap_high - unmap a highmem page from memory
+ * @page: &struct page to unmap
+ *
+ * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
+ * only from user context.
+ */
+void kunmap_high(struct page *page)
{
unsigned long vaddr;
unsigned long nr;
+ unsigned long flags;
int need_wakeup;
- spin_lock(&kmap_lock);
+ lock_kmap_any(flags);
vaddr = (unsigned long)page_address(page);
BUG_ON(!vaddr);
nr = PKMAP_NR(vaddr);
@@ -209,7 +301,7 @@ void fastcall kunmap_high(struct page *page)
*/
need_wakeup = waitqueue_active(&pkmap_map_wait);
}
- spin_unlock(&kmap_lock);
+ unlock_kmap_any(flags);
/* do wake-up, if needed, race-free outside of the spin lock */
if (need_wakeup)
@@ -217,282 +309,8 @@ void fastcall kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-
-#define POOL_SIZE 64
-
-static __init int init_emergency_pool(void)
-{
- struct sysinfo i;
- si_meminfo(&i);
- si_swapinfo(&i);
-
- if (!i.totalhigh)
- return 0;
-
- page_pool = mempool_create_page_pool(POOL_SIZE, 0);
- BUG_ON(!page_pool);
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
-
- return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * highmem version, map in to vec
- */
-static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
-{
- unsigned long flags;
- unsigned char *vto;
-
- local_irq_save(flags);
- vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
- memcpy(vto + to->bv_offset, vfrom, to->bv_len);
- kunmap_atomic(vto, KM_BOUNCE_READ);
- local_irq_restore(flags);
-}
-
-#else /* CONFIG_HIGHMEM */
-
-#define bounce_copy_vec(to, vfrom) \
- memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
-
#endif
-#define ISA_POOL_SIZE 16
-
-/*
- * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
- * as the max address, so check if the pool has already been created.
- */
-int init_emergency_isa_pool(void)
-{
- if (isa_page_pool)
- return 0;
-
- isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
- mempool_free_pages, (void *) 0);
- BUG_ON(!isa_page_pool);
-
- printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
- return 0;
-}
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
- unsigned char *vfrom;
- struct bio_vec *tovec, *fromvec;
- int i;
-
- __bio_for_each_segment(tovec, to, i, 0) {
- fromvec = from->bi_io_vec + i;
-
- /*
- * not bounced
- */
- if (tovec->bv_page == fromvec->bv_page)
- continue;
-
- /*
- * fromvec->bv_offset and fromvec->bv_len might have been
- * modified by the block layer, so use the original copy,
- * bounce_copy_vec already uses tovec->bv_len
- */
- vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
-
- flush_dcache_page(tovec->bv_page);
- bounce_copy_vec(tovec, vfrom);
- }
-}
-
-static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
-{
- struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, *org_vec;
- int i;
-
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
- /*
- * free up bounce indirect pages used
- */
- __bio_for_each_segment(bvec, bio, i, 0) {
- org_vec = bio_orig->bi_io_vec + i;
- if (bvec->bv_page == org_vec->bv_page)
- continue;
-
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, pool);
- }
-
- bio_endio(bio_orig, bio_orig->bi_size, err);
- bio_put(bio);
-}
-
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- bounce_end_io(bio, page_pool, err);
- return 0;
-}
-
-static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- bounce_end_io(bio, isa_page_pool, err);
- return 0;
-}
-
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
-{
- struct bio *bio_orig = bio->bi_private;
-
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- copy_to_high_bio_irq(bio_orig, bio);
-
- bounce_end_io(bio, pool, err);
-}
-
-static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- __bounce_end_io_read(bio, page_pool, err);
- return 0;
-}
-
-static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
- if (bio->bi_size)
- return 1;
-
- __bounce_end_io_read(bio, isa_page_pool, err);
- return 0;
-}
-
-static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
- mempool_t *pool)
-{
- struct page *page;
- struct bio *bio = NULL;
- int i, rw = bio_data_dir(*bio_orig);
- struct bio_vec *to, *from;
-
- bio_for_each_segment(from, *bio_orig, i) {
- page = from->bv_page;
-
- /*
- * is destination page below bounce pfn?
- */
- if (page_to_pfn(page) < q->bounce_pfn)
- continue;
-
- /*
- * irk, bounce it
- */
- if (!bio)
- bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
-
- to = bio->bi_io_vec + i;
-
- to->bv_page = mempool_alloc(pool, q->bounce_gfp);
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
- inc_zone_page_state(to->bv_page, NR_BOUNCE);
-
- if (rw == WRITE) {
- char *vto, *vfrom;
-
- flush_dcache_page(from->bv_page);
- vto = page_address(to->bv_page) + to->bv_offset;
- vfrom = kmap(from->bv_page) + from->bv_offset;
- memcpy(vto, vfrom, to->bv_len);
- kunmap(from->bv_page);
- }
- }
-
- /*
- * no pages bounced
- */
- if (!bio)
- return;
-
- /*
- * at least one page was bounced, fill in possible non-highmem
- * pages
- */
- __bio_for_each_segment(from, *bio_orig, i, 0) {
- to = bio_iovec_idx(bio, i);
- if (!to->bv_page) {
- to->bv_page = from->bv_page;
- to->bv_len = from->bv_len;
- to->bv_offset = from->bv_offset;
- }
- }
-
- bio->bi_bdev = (*bio_orig)->bi_bdev;
- bio->bi_flags |= (1 << BIO_BOUNCED);
- bio->bi_sector = (*bio_orig)->bi_sector;
- bio->bi_rw = (*bio_orig)->bi_rw;
-
- bio->bi_vcnt = (*bio_orig)->bi_vcnt;
- bio->bi_idx = (*bio_orig)->bi_idx;
- bio->bi_size = (*bio_orig)->bi_size;
-
- if (pool == page_pool) {
- bio->bi_end_io = bounce_end_io_write;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- } else {
- bio->bi_end_io = bounce_end_io_write_isa;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read_isa;
- }
-
- bio->bi_private = *bio_orig;
- *bio_orig = bio;
-}
-
-void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
-{
- mempool_t *pool;
-
- /*
- * for non-isa bounce case, just check if the bounce pfn is equal
- * to or bigger than the highest pfn in the system -- in that case,
- * don't waste time iterating over bio segments
- */
- if (!(q->bounce_gfp & GFP_DMA)) {
- if (q->bounce_pfn >= blk_max_pfn)
- return;
- pool = page_pool;
- } else {
- BUG_ON(!isa_page_pool);
- pool = isa_page_pool;
- }
-
- blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
- /*
- * slow path
- */
- __blk_queue_bounce(q, bio_orig, pool);
-}
-
-EXPORT_SYMBOL(blk_queue_bounce);
-
#if defined(HASHED_PAGE_VIRTUAL)
#define PA_HASH_ORDER 7
@@ -506,11 +324,7 @@ struct page_address_map {
struct list_head list;
};
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool; /* freelist */
-static spinlock_t pool_lock; /* protects page_address_pool */
+static struct page_address_map page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
@@ -520,12 +334,18 @@ static struct page_address_slot {
spinlock_t lock; /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
{
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
-void *page_address(struct page *page)
+/**
+ * page_address - get the mapped virtual address of a page
+ * @page: &struct page to get the virtual address of
+ *
+ * Returns the page's virtual address.
+ */
+void *page_address(const struct page *page)
{
unsigned long flags;
void *ret;
@@ -554,6 +374,11 @@ done:
EXPORT_SYMBOL(page_address);
+/**
+ * set_page_address - set a page's virtual address
+ * @page: &struct page to set
+ * @virtual: virtual address to use
+ */
void set_page_address(struct page *page, void *virtual)
{
unsigned long flags;
@@ -564,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
pas = page_slot(page);
if (virtual) { /* Add */
- BUG_ON(list_empty(&page_address_pool));
-
- spin_lock_irqsave(&pool_lock, flags);
- pam = list_entry(page_address_pool.next,
- struct page_address_map, list);
- list_del(&pam->list);
- spin_unlock_irqrestore(&pool_lock, flags);
-
+ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
pam->page = page;
pam->virtual = virtual;
@@ -584,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
if (pam->page == page) {
list_del(&pam->list);
spin_unlock_irqrestore(&pas->lock, flags);
- spin_lock_irqsave(&pool_lock, flags);
- list_add_tail(&pam->list, &page_address_pool);
- spin_unlock_irqrestore(&pool_lock, flags);
goto done;
}
}
@@ -596,20 +411,14 @@ done:
return;
}
-static struct page_address_map page_address_maps[LAST_PKMAP];
-
void __init page_address_init(void)
{
int i;
- INIT_LIST_HEAD(&page_address_pool);
- for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
- list_add(&page_address_maps[i].list, &page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
- spin_lock_init(&pool_lock);
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 00000000000..33514d88fef
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2939 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/shrinker.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/migrate.h>
+#include <linux/hashtable.h>
+#include <linux/string.h>
+
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is disabled in order to avoid
+ * the risk of increasing the memory footprint of applications without a
+ * guaranteed benefit. When transparent hugepage support is enabled, it is
+ * enabled for all mappings, and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
+ *
+ * The initial value always has the DEFRAG, DEFRAG_KHUGEPAGED and
+ * USE_ZERO_PAGE bits set; the "always" and "madvise" enable bits are only
+ * set when the matching CONFIG_TRANSPARENT_HUGEPAGE_* option is defined.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+
+/* default: scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+/* counters reported read-only through the khugepaged sysfs group */
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+/* task created by start_khugepaged(); NULL while no thread is running */
+static struct task_struct *khugepaged_thread __read_mostly;
+/* held by enabled_store() around start_khugepaged() */
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+/* woken when there is work to scan or when a sleep tunable changes */
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int khugepaged_slab_init(void);
+
+#define MM_SLOTS_HASH_BITS 10
+static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+};
+/* The single cursor instance; only mm_head is initialized explicitly. */
+static struct khugepaged_scan khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+/*
+ * Raise min_free_kbytes, if needed, to a level that helps hugepage
+ * allocations, then recompute the per-zone watermarks.
+ *
+ * Does nothing when khugepaged is not enabled. Always returns 0 so it can
+ * be used as an initcall.
+ */
+static int set_recommended_min_free_kbytes(void)
+{
+	unsigned long wanted, lowmem_cap;
+	int populated = 0;
+	struct zone *z;
+
+	if (!khugepaged_enabled())
+		return 0;
+
+	for_each_populated_zone(z)
+		populated++;
+
+	/* Two pageblocks per populated zone for MIGRATE_RESERVE. */
+	wanted = pageblock_nr_pages * populated * 2;
+
+	/*
+	 * Plus, on average, two nearly free pageblocks of another type per
+	 * migratetype: one to fall back to and one to avoid subsequent
+	 * fallbacks of other types. Counted over MIGRATE_PCPTYPES.
+	 */
+	wanted += pageblock_nr_pages * populated *
+		  MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+	/* Never reserve more than 5% of lowmem. */
+	lowmem_cap = (unsigned long) nr_free_buffer_pages() / 20;
+	if (wanted > lowmem_cap)
+		wanted = lowmem_cap;
+
+	/* Convert pages to kilobytes. */
+	wanted <<= (PAGE_SHIFT-10);
+
+	if (wanted > min_free_kbytes) {
+		if (user_min_free_kbytes >= 0)
+			pr_info("raising min_free_kbytes from %d to %lu "
+				"to help transparent hugepage allocations\n",
+				min_free_kbytes, wanted);
+
+		min_free_kbytes = wanted;
+	}
+	setup_per_zone_wmarks();
+	return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+/*
+ * Bring the khugepaged thread in line with the current enable flags:
+ * start it when khugepaged is enabled and not yet running, stop it when
+ * khugepaged is disabled and a thread exists.
+ *
+ * Returns 0, or the PTR_ERR() of a failed kthread_run().
+ */
+static int start_khugepaged(void)
+{
+	int err = 0;
+
+	if (!khugepaged_enabled()) {
+		if (khugepaged_thread) {
+			kthread_stop(khugepaged_thread);
+			khugepaged_thread = NULL;
+		}
+		return err;
+	}
+
+	if (!khugepaged_thread)
+		khugepaged_thread = kthread_run(khugepaged, NULL,
+						"khugepaged");
+	if (unlikely(IS_ERR(khugepaged_thread))) {
+		pr_err("khugepaged: kthread_run(khugepaged) failed\n");
+		err = PTR_ERR(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
+
+	/* Kick the scanner if mms are already queued. */
+	if (!list_empty(&khugepaged_scan.mm_head))
+		wake_up_interruptible(&khugepaged_wait);
+
+	set_recommended_min_free_kbytes();
+
+	return err;
+}
+
+/* Reference count and pointer for the shared huge zero page. */
+static atomic_t huge_zero_refcount;
+static struct page *huge_zero_page __read_mostly;
+
+/* True when @page is the page currently published in huge_zero_page. */
+static inline bool is_huge_zero_page(struct page *page)
+{
+	struct page *current_zero = ACCESS_ONCE(huge_zero_page);
+
+	return current_zero == page;
+}
+
+/* True when @pmd maps the huge zero page. */
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+	struct page *mapped = pmd_page(pmd);
+
+	return is_huge_zero_page(mapped);
+}
+
+/*
+ * Take a reference on the huge zero page, allocating and publishing it
+ * first if it does not exist yet.
+ *
+ * Returns the huge zero page, or NULL if it had to be allocated and the
+ * allocation failed.
+ */
+static struct page *get_huge_zero_page(void)
+{
+	struct page *zero_page;
+retry:
+	/* Fast path: the page exists and still has at least one user. */
+	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+		return ACCESS_ONCE(huge_zero_page);
+
+	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+			HPAGE_PMD_ORDER);
+	if (!zero_page) {
+		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+		return NULL;
+	}
+	count_vm_event(THP_ZERO_PAGE_ALLOC);
+	preempt_disable();
+	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
+		/*
+		 * Somebody else published a page first. Ours was allocated
+		 * at HPAGE_PMD_ORDER, so it must be released at that order;
+		 * __free_page() would only free it as an order-0 page.
+		 */
+		preempt_enable();
+		__free_pages(zero_page, HPAGE_PMD_ORDER);
+		goto retry;
+	}
+
+	/* We take additional reference here. It will be put back by shrinker */
+	atomic_set(&huge_zero_refcount, 2);
+	preempt_enable();
+	return ACCESS_ONCE(huge_zero_page);
+}
+
+/*
+ * Drop one reference on the huge zero page. Callers must never hold the
+ * last reference: that one belongs to the shrinker, so reaching zero here
+ * is a bug.
+ */
+static void put_huge_zero_page(void)
+{
+	bool dropped_last = atomic_dec_and_test(&huge_zero_refcount);
+
+	BUG_ON(dropped_last);
+}
+
+/*
+ * Shrinker count callback: report HPAGE_PMD_NR freeable objects when only
+ * one reference to the huge zero page remains, and 0 otherwise.
+ */
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	if (atomic_read(&huge_zero_refcount) != 1)
+		return 0;
+
+	return HPAGE_PMD_NR;
+}
+
+/*
+ * Shrinker scan callback: if the refcount can be moved from 1 to 0, unpublish
+ * the huge zero page and free it.
+ *
+ * Returns HPAGE_PMD_NR when the page was freed, 0 otherwise.
+ */
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+		struct page *zero_page = xchg(&huge_zero_page, NULL);
+		BUG_ON(zero_page == NULL);
+		/*
+		 * get_huge_zero_page() allocates the page at HPAGE_PMD_ORDER,
+		 * so free it at that order; __free_page() would only free it
+		 * as an order-0 page.
+		 */
+		__free_pages(zero_page, HPAGE_PMD_ORDER);
+		return HPAGE_PMD_NR;
+	}
+
+	return 0;
+}
+
+/* Shrinker that frees the huge zero page once only its own reference is left. */
+static struct shrinker huge_zero_page_shrinker = {
+ .count_objects = shrink_huge_zero_page_count,
+ .scan_objects = shrink_huge_zero_page_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_SYSFS
+
+/*
+ * Format a tri-state setting backed by two flag bits into @buf, with the
+ * active choice in brackets: @enabled set means "always", @req_madv set
+ * means "madvise", neither means "never".
+ *
+ * Returns the number of characters written.
+ */
+static ssize_t double_flag_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf,
+				enum transparent_hugepage_flag enabled,
+				enum transparent_hugepage_flag req_madv)
+{
+	if (test_bit(enabled, &transparent_hugepage_flags)) {
+		/* The two bits are expected to be mutually exclusive. */
+		VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+		return sprintf(buf, "[always] madvise never\n");
+	}
+
+	if (test_bit(req_madv, &transparent_hugepage_flags))
+		return sprintf(buf, "always [madvise] never\n");
+
+	return sprintf(buf, "always madvise [never]\n");
+}
+/*
+ * Parse "always", "madvise" or "never" from @buf and update the @enabled
+ * and @req_madv bits of transparent_hugepage_flags to match.
+ *
+ * The whole input must match one keyword (a single trailing newline is
+ * tolerated by sysfs_streq()). The previous memcmp() over
+ * min(keyword length, count) bytes accepted an empty write, any prefix of
+ * a keyword and arbitrary trailing text, all as "always" or the first
+ * keyword that happened to match.
+ *
+ * NOTE(review): sysfs_streq() needs @buf to be NUL-terminated; this is
+ * presumed to hold for sysfs store buffers - confirm.
+ *
+ * Returns @count on success, -EINVAL if the input is not a known keyword.
+ */
+static ssize_t double_flag_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count,
+				 enum transparent_hugepage_flag enabled,
+				 enum transparent_hugepage_flag req_madv)
+{
+	if (sysfs_streq(buf, "always")) {
+		set_bit(enabled, &transparent_hugepage_flags);
+		clear_bit(req_madv, &transparent_hugepage_flags);
+	} else if (sysfs_streq(buf, "madvise")) {
+		clear_bit(enabled, &transparent_hugepage_flags);
+		set_bit(req_madv, &transparent_hugepage_flags);
+	} else if (sysfs_streq(buf, "never")) {
+		clear_bit(enabled, &transparent_hugepage_flags);
+		clear_bit(req_madv, &transparent_hugepage_flags);
+	} else
+		return -EINVAL;
+
+	return count;
+}
+
+/* sysfs show for "enabled": prints the always/madvise/never state. */
+static ssize_t enabled_show(struct kobject *kobj,
+			    struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag always_bit =
+		TRANSPARENT_HUGEPAGE_FLAG;
+	const enum transparent_hugepage_flag madvise_bit =
+		TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG;
+
+	return double_flag_show(kobj, attr, buf, always_bit, madvise_bit);
+}
+/*
+ * sysfs store for "enabled": update the enable bits and, when the write
+ * was accepted, start or stop khugepaged under khugepaged_mutex.
+ *
+ * Returns the store result, or the start_khugepaged() error if that failed.
+ */
+static ssize_t enabled_store(struct kobject *kobj,
+			     struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	ssize_t ret;
+	int err;
+
+	ret = double_flag_store(kobj, attr, buf, count,
+				TRANSPARENT_HUGEPAGE_FLAG,
+				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+	if (ret <= 0)
+		return ret;
+
+	mutex_lock(&khugepaged_mutex);
+	err = start_khugepaged();
+	mutex_unlock(&khugepaged_mutex);
+
+	return err ? err : ret;
+}
+static struct kobj_attribute enabled_attr =
+	__ATTR(enabled, 0644, enabled_show, enabled_store);
+
+/* Print a single bit of transparent_hugepage_flags as "0\n" or "1\n". */
+static ssize_t single_flag_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf,
+				enum transparent_hugepage_flag flag)
+{
+	int is_set = !!test_bit(flag, &transparent_hugepage_flags);
+
+	return sprintf(buf, "%d\n", is_set);
+}
+
+/*
+ * Parse a decimal 0 or 1 from @buf and clear or set @flag in
+ * transparent_hugepage_flags accordingly.
+ *
+ * Returns @count on success, the kstrtoul() error on a parse failure, or
+ * -EINVAL for any value greater than 1.
+ */
+static ssize_t single_flag_store(struct kobject *kobj,
+				 struct kobj_attribute *attr,
+				 const char *buf, size_t count,
+				 enum transparent_hugepage_flag flag)
+{
+	unsigned long value;
+	int ret = kstrtoul(buf, 10, &value);
+
+	if (ret < 0)
+		return ret;
+
+	switch (value) {
+	case 0:
+		clear_bit(flag, &transparent_hugepage_flags);
+		break;
+	case 1:
+		set_bit(flag, &transparent_hugepage_flags);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+/*
+ * Currently defrag only controls whether __GFP_WAIT is kept for the
+ * allocation (it is cleared when defrag is off). A blind __GFP_REPEAT
+ * is too aggressive, it's never worth swapping tons of memory just to
+ * allocate one more hugepage.
+ */
+/* sysfs show for "defrag": prints the always/madvise/never state. */
+static ssize_t defrag_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag always_bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_FLAG;
+	const enum transparent_hugepage_flag madvise_bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG;
+
+	return double_flag_show(kobj, attr, buf, always_bit, madvise_bit);
+}
+/* sysfs store for "defrag": accepts always/madvise/never. */
+static ssize_t defrag_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag always_bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_FLAG;
+	const enum transparent_hugepage_flag madvise_bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG;
+
+	return double_flag_store(kobj, attr, buf, count,
+				 always_bit, madvise_bit);
+}
+static struct kobj_attribute defrag_attr =
+	__ATTR(defrag, 0644, defrag_show, defrag_store);
+
+/* sysfs show for "use_zero_page": prints the flag as 0 or 1. */
+static ssize_t use_zero_page_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG;
+
+	return single_flag_show(kobj, attr, buf, bit);
+}
+/* sysfs store for "use_zero_page": accepts 0 or 1. */
+static ssize_t use_zero_page_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG;
+
+	return single_flag_store(kobj, attr, buf, count, bit);
+}
+static struct kobj_attribute use_zero_page_attr =
+	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+#ifdef CONFIG_DEBUG_VM
+/* sysfs show for "debug_cow" (CONFIG_DEBUG_VM only): prints 0 or 1. */
+static ssize_t debug_cow_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG;
+
+	return single_flag_show(kobj, attr, buf, bit);
+}
+/* sysfs store for "debug_cow" (CONFIG_DEBUG_VM only): accepts 0 or 1. */
+static ssize_t debug_cow_store(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG;
+
+	return single_flag_store(kobj, attr, buf, count, bit);
+}
+static struct kobj_attribute debug_cow_attr =
+	__ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+/*
+ * Attributes of the unnamed group that hugepage_init_sysfs() creates on
+ * the "transparent_hugepage" kobject. debug_cow is only listed when
+ * CONFIG_DEBUG_VM is defined.
+ */
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+ &use_zero_page_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+/* sysfs show for khugepaged/scan_sleep_millisecs. */
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *buf)
+{
+	unsigned int msecs = khugepaged_scan_sleep_millisecs;
+
+	return sprintf(buf, "%u\n", msecs);
+}
+
+/*
+ * sysfs store for khugepaged/scan_sleep_millisecs: parse a decimal value
+ * that fits in an unsigned int, store it and wake khugepaged_wait.
+ *
+ * Returns @count on success, -EINVAL on a parse error or out-of-range value.
+ */
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	unsigned long parsed;
+
+	if (kstrtoul(buf, 10, &parsed))
+		return -EINVAL;
+	if (parsed > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_scan_sleep_millisecs = parsed;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+	       scan_sleep_millisecs_store);
+
+/* sysfs show for khugepaged/alloc_sleep_millisecs. */
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  char *buf)
+{
+	unsigned int msecs = khugepaged_alloc_sleep_millisecs;
+
+	return sprintf(buf, "%u\n", msecs);
+}
+
+/*
+ * sysfs store for khugepaged/alloc_sleep_millisecs: parse a decimal value
+ * that fits in an unsigned int, store it and wake khugepaged_wait.
+ *
+ * Returns @count on success, -EINVAL on a parse error or out-of-range value.
+ */
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	unsigned long parsed;
+
+	if (kstrtoul(buf, 10, &parsed))
+		return -EINVAL;
+	if (parsed > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_alloc_sleep_millisecs = parsed;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+	       alloc_sleep_millisecs_store);
+
+/* sysfs show for khugepaged/pages_to_scan. */
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  char *buf)
+{
+	unsigned int pages = khugepaged_pages_to_scan;
+
+	return sprintf(buf, "%u\n", pages);
+}
+/*
+ * sysfs store for khugepaged/pages_to_scan: parse a non-zero decimal value
+ * that fits in an unsigned int and store it.
+ *
+ * Returns @count on success, -EINVAL on a parse error, zero, or overflow.
+ */
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned long parsed;
+
+	if (kstrtoul(buf, 10, &parsed))
+		return -EINVAL;
+	if (parsed == 0 || parsed > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_pages_to_scan = parsed;
+
+	return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
+	       pages_to_scan_store);
+
+/* sysfs show for the read-only khugepaged/pages_collapsed counter. */
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	unsigned int collapsed = khugepaged_pages_collapsed;
+
+	return sprintf(buf, "%u\n", collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+	__ATTR_RO(pages_collapsed);
+
+/* sysfs show for the read-only khugepaged/full_scans counter. */
+static ssize_t full_scans_show(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       char *buf)
+{
+	unsigned int scans = khugepaged_full_scans;
+
+	return sprintf(buf, "%u\n", scans);
+}
+static struct kobj_attribute full_scans_attr =
+	__ATTR_RO(full_scans);
+
+/* sysfs show for khugepaged/defrag: prints the flag as 0 or 1. */
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG;
+
+	return single_flag_show(kobj, attr, buf, bit);
+}
+/* sysfs store for khugepaged/defrag: accepts 0 or 1. */
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	const enum transparent_hugepage_flag bit =
+		TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG;
+
+	return single_flag_store(kobj, attr, buf, count, bit);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+	__ATTR(defrag, 0644, khugepaged_defrag_show,
+	       khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+/* sysfs show for khugepaged/max_ptes_none. */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *buf)
+{
+	unsigned int limit = khugepaged_max_ptes_none;
+
+	return sprintf(buf, "%u\n", limit);
+}
+/*
+ * sysfs store for khugepaged/max_ptes_none: parse a decimal value no larger
+ * than HPAGE_PMD_NR-1 and store it.
+ *
+ * Returns @count on success, -EINVAL on a parse error or out-of-range value.
+ */
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+					      struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	unsigned long parsed;
+
+	if (kstrtoul(buf, 10, &parsed))
+		return -EINVAL;
+	if (parsed > HPAGE_PMD_NR-1)
+		return -EINVAL;
+
+	khugepaged_max_ptes_none = parsed;
+
+	return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+	       khugepaged_max_ptes_none_store);
+
+/*
+ * Attributes of the group named "khugepaged" that hugepage_init_sysfs()
+ * creates on the "transparent_hugepage" kobject.
+ */
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+
+/*
+ * Create the "transparent_hugepage" kobject under mm_kobj and register
+ * both attribute groups on it.
+ *
+ * @hugepage_kobj receives the kobject pointer. On failure everything set
+ * up so far is undone and a negative errno is returned; returns 0 on
+ * success.
+ */
+static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+	struct kobject *kobj;
+	int err;
+
+	kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+	*hugepage_kobj = kobj;
+	if (unlikely(!kobj)) {
+		pr_err("failed to create transparent hugepage kobject\n");
+		return -ENOMEM;
+	}
+
+	err = sysfs_create_group(kobj, &hugepage_attr_group);
+	if (err) {
+		pr_err("failed to register transparent hugepage group\n");
+		kobject_put(kobj);
+		return err;
+	}
+
+	err = sysfs_create_group(kobj, &khugepaged_attr_group);
+	if (err) {
+		pr_err("failed to register transparent hugepage group\n");
+		sysfs_remove_group(kobj, &hugepage_attr_group);
+		kobject_put(kobj);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Undo hugepage_init_sysfs(): remove both attribute groups in the reverse
+ * order of their creation, then drop the kobject reference.
+ */
+static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+ sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+ sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+ kobject_put(hugepage_kobj);
+}
+#else
+/*
+ * !CONFIG_SYSFS stubs: nothing is created, so *hugepage_kobj is left
+ * untouched and init always reports success.
+ */
+static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+ return 0;
+}
+
+static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+/*
+ * Boot-time setup for transparent hugepages: sysfs files, the khugepaged
+ * slab, the huge zero page shrinker and the khugepaged thread.
+ *
+ * Unlike before, failures of register_shrinker() and start_khugepaged()
+ * are no longer ignored: they unwind what was set up and are returned.
+ *
+ * Returns 0 on success, -EINVAL when the hardware has no transparent
+ * hugepage support, or the error of the step that failed.
+ */
+static int __init hugepage_init(void)
+{
+	int err;
+	struct kobject *hugepage_kobj;
+
+	if (!has_transparent_hugepage()) {
+		transparent_hugepage_flags = 0;
+		return -EINVAL;
+	}
+
+	err = hugepage_init_sysfs(&hugepage_kobj);
+	if (err)
+		return err;
+
+	err = khugepaged_slab_init();
+	if (err)
+		goto out;
+
+	/*
+	 * NOTE(review): the paths below do not tear down whatever
+	 * khugepaged_slab_init() set up, as no counterpart is visible here;
+	 * confirm whether one is needed.
+	 */
+	err = register_shrinker(&huge_zero_page_shrinker);
+	if (err)
+		goto out;
+
+	/*
+	 * By default disable transparent hugepages on smaller systems,
+	 * where the extra memory used could hurt more than TLB overhead
+	 * is likely to save. The admin can still enable it through /sys.
+	 */
+	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+		transparent_hugepage_flags = 0;
+
+	err = start_khugepaged();
+	if (err)
+		goto out_shrinker;
+
+	return 0;
+out_shrinker:
+	unregister_shrinker(&huge_zero_page_shrinker);
+out:
+	hugepage_exit_sysfs(hugepage_kobj);
+	return err;
+}
+subsys_initcall(hugepage_init);
+
+/*
+ * Handle the "transparent_hugepage=" boot parameter. Accepts "always",
+ * "madvise" or "never" and sets the two enable bits to match.
+ *
+ * Returns 1 when the value was recognized; otherwise warns and returns 0.
+ */
+static int __init setup_transparent_hugepage(char *str)
+{
+	bool want_always, want_madvise;
+
+	if (!str)
+		goto unparsed;
+
+	if (!strcmp(str, "always")) {
+		want_always = true;
+		want_madvise = false;
+	} else if (!strcmp(str, "madvise")) {
+		want_always = false;
+		want_madvise = true;
+	} else if (!strcmp(str, "never")) {
+		want_always = false;
+		want_madvise = false;
+	} else {
+		goto unparsed;
+	}
+
+	if (want_always)
+		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			&transparent_hugepage_flags);
+	else
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			  &transparent_hugepage_flags);
+
+	if (want_madvise)
+		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			&transparent_hugepage_flags);
+	else
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			  &transparent_hugepage_flags);
+
+	return 1;
+
+unparsed:
+	pr_warn("transparent_hugepage= cannot parse, ignored\n");
+	return 0;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
+/* Return @pmd made writable if @vma has VM_WRITE, otherwise unchanged. */
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+	if (unlikely(!(vma->vm_flags & VM_WRITE)))
+		return pmd;
+
+	return pmd_mkwrite(pmd);
+}
+
+/* Build a huge pmd entry for @page with protection @prot. */
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
+{
+	return pmd_mkhuge(mk_pmd(page, prot));
+}
+
+/*
+ * Install the freshly allocated huge @page at @pmd for address @haddr.
+ *
+ * Allocates a page table to deposit with the pmd, clears the page, and
+ * maps it under the pmd lock. If the pmd got populated in the meantime,
+ * the page is uncharged and released and the page table freed instead.
+ *
+ * Returns VM_FAULT_OOM if the page table cannot be allocated (the page is
+ * left to the caller in that case), 0 otherwise.
+ */
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ struct page *page)
+{
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * clear_huge_page writes become visible before the set_pmd_at()
+ * write.
+ */
+ __SetPageUptodate(page);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_none(*pmd))) {
+ /* pmd was populated while we were unlocked: back out */
+ spin_unlock(ptl);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ pte_free(mm, pgtable);
+ } else {
+ pmd_t entry;
+ entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, haddr, pmd, entry);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&mm->nr_ptes);
+ spin_unlock(ptl);
+ }
+
+ return 0;
+}
+
+/*
+ * GFP mask for a hugepage allocation: GFP_TRANSHUGE, with __GFP_WAIT
+ * dropped when @defrag is zero, plus @extra_gfp.
+ */
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+{
+	gfp_t mask = GFP_TRANSHUGE;
+
+	if (!defrag)
+		mask &= ~__GFP_WAIT;
+
+	return mask | extra_gfp;
+}
+
+/*
+ * Allocate an HPAGE_PMD_ORDER page for @vma at @haddr on node @nd, using
+ * the mask from alloc_hugepage_gfpmask(). Returns whatever
+ * alloc_pages_vma() returns.
+ */
+static inline struct page *alloc_hugepage_vma(int defrag,
+					      struct vm_area_struct *vma,
+					      unsigned long haddr, int nd,
+					      gfp_t extra_gfp)
+{
+	gfp_t mask = alloc_hugepage_gfpmask(defrag, extra_gfp);
+
+	return alloc_pages_vma(mask, HPAGE_PMD_ORDER, vma, haddr, nd);
+}
+
+/*
+ * Map @zero_page write-protected at @pmd and deposit @pgtable with it.
+ * Caller must hold page table lock.
+ *
+ * Returns false, changing nothing, if @pmd is already populated; the
+ * caller then still owns @pgtable. Returns true once the entry is set.
+ */
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ struct page *zero_page)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, haddr, pmd, entry);
+ atomic_long_inc(&mm->nr_ptes);
+ return true;
+}
+
+/*
+ * Handle an anonymous page fault at @address by mapping a huge page.
+ *
+ * Read faults map the shared huge zero page when that feature is enabled;
+ * otherwise a new huge page is allocated, charged and mapped.
+ *
+ * Returns 0 when the fault was handled (including when another thread
+ * populated the pmd first), VM_FAULT_FALLBACK when the huge-page-aligned
+ * range does not fit in @vma or a huge page could not be set up, and
+ * VM_FAULT_OOM when anon_vma, khugepaged or page table setup fails.
+ */
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+
+ /* the whole huge page must lie inside the vma */
+ if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ struct page *zero_page;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_page = get_huge_zero_page();
+ if (unlikely(!zero_page)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ ptl = pmd_lock(mm, pmd);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ if (!set) {
+ /* pmd already populated: give back what we took */
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+ /* non-zero only when the page table allocation failed */
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC);
+ return 0;
+}
+
+/*
+ * Copy the huge pmd mapping at @addr from @src_mm to @dst_mm.
+ *
+ * A huge zero pmd is duplicated by taking another zero page reference.
+ * Any other huge pmd gets an extra page reference and rmap, and is
+ * write-protected in the source; the destination entry is additionally
+ * marked old.
+ *
+ * Returns 0 on success, -ENOMEM if the page table to deposit cannot be
+ * allocated, and -EAGAIN if the source pmd is not (or no longer) a huge
+ * pmd, or was being split.
+ */
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ spinlock_t *dst_ptl, *src_ptl;
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ /* destination lock first, then the source lock nested inside it */
+ dst_ptl = pmd_lock(dst_mm, dst_pmd);
+ src_ptl = pmd_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ /*
+ * When page table lock is held, the huge zero pmd should not be
+ * under splitting since we don't split the page itself, only pmd to
+ * a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ struct page *zero_page;
+ bool set;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_page = get_huge_zero_page();
+ set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_page);
+ BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ atomic_long_inc(&dst_mm->nr_ptes);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+out:
+ return ret;
+}
+
+/*
+ * Mark the huge pmd covering @address young (and dirty when @dirty is set),
+ * but only if it still equals @orig_pmd once the pmd lock is held.
+ */
+void huge_pmd_set_accessed(struct mm_struct *mm,
+			   struct vm_area_struct *vma,
+			   unsigned long address,
+			   pmd_t *pmd, pmd_t orig_pmd,
+			   int dirty)
+{
+	spinlock_t *ptl = pmd_lock(mm, pmd);
+
+	if (likely(pmd_same(*pmd, orig_pmd))) {
+		pmd_t young = pmd_mkyoung(orig_pmd);
+		unsigned long haddr = address & HPAGE_PMD_MASK;
+
+		if (pmdp_set_access_flags(vma, haddr, pmd, young, dirty))
+			update_mmu_cache_pmd(vma, address, pmd);
+	}
+
+	spin_unlock(ptl);
+}
+
+/*
+ * Pin a huge page for the duration of a copy.
+ *
+ * With CONFIG_DEBUG_PAGEALLOC, every tail page is pinned as well so that
+ * copy_user_huge_page()'s copy_page_rep() cannot fault falsely if the
+ * source page is split and a tail freed before the copy completes.
+ * Called under pmd_lock of the checked pmd, so safe from splitting itself.
+ */
+static void get_user_huge_page(struct page *page)
+{
+	struct page *tail, *endpage;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		get_page(page);
+		return;
+	}
+
+	endpage = page + HPAGE_PMD_NR;
+	atomic_add(HPAGE_PMD_NR, &page->_count);
+	for (tail = page + 1; tail < endpage; tail++)
+		get_huge_page_tail(tail);
+}
+
+/*
+ * Release the references taken by get_user_huge_page(): one put_page() per
+ * subpage with CONFIG_DEBUG_PAGEALLOC, a single put_page() otherwise.
+ */
+static void put_user_huge_page(struct page *page)
+{
+	int i;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
+		put_page(page);
+		return;
+	}
+
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		put_page(page + i);
+}
+
+/*
+ * Write-fault fallback for when no new huge page is available: copy the
+ * huge @page into HPAGE_PMD_NR freshly allocated small pages and replace
+ * the huge pmd with a page table that maps them.
+ *
+ * Returns VM_FAULT_WRITE on success, VM_FAULT_OOM if the pointer array, a
+ * small page or its memcg charge could not be obtained, and 0 if @pmd no
+ * longer equals @orig_pmd by the time the lock is taken (the copies are
+ * then discarded).
+ */
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ struct page *page,
+ unsigned long haddr)
+{
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int ret = 0, i;
+ struct page **pages;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
+ if (unlikely(!pages[i] ||
+ mem_cgroup_charge_anon(pages[i], mm,
+ GFP_KERNEL))) {
+ /* page i was never charged; only drop its reference */
+ if (pages[i])
+ put_page(pages[i]);
+ /* unwind the pages that were allocated and charged */
+ mem_cgroup_uncharge_start();
+ while (--i >= 0) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ /* copy the data before taking the pmd lock */
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SIZE * i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_pages;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ /* fill the ptes through a local pmd value pointing at the page table */
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(pages[i], vma, haddr);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ kfree(pages);
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ page_remove_rmap(page);
+ spin_unlock(ptl);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ /* pmd changed under us: drop every copy and report nothing done */
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ goto out;
+}
+/*
+ * Handle a write fault on a write-protected huge pmd.
+ *
+ * If the pmd maps a page whose mapcount is 1, the existing entry is made
+ * young, dirty and (if the vma allows) writable in place. Otherwise a new
+ * huge page is allocated, charged, filled (cleared for the huge zero pmd,
+ * copied from the old page otherwise) and installed in place of the old
+ * entry, provided *pmd still equals @orig_pmd once the lock is retaken.
+ * When no huge page can be allocated or charged, the mapping is split or
+ * copied into small pages instead and VM_FAULT_FALLBACK may be set.
+ *
+ * Returns a mask of VM_FAULT_* bits; 0 if the pmd changed under us.
+ */
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ spinlock_t *ptl;
+ int ret = 0;
+ struct page *page = NULL, *new_page;
+ unsigned long haddr;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ ptl = pmd_lockptr(mm, pmd);
+ VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc; /* page stays NULL on this path */
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ if (page_mapcount(page) == 1) { /* single mapping: make the entry writable in place */
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache_pmd(vma, address, pmd);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ get_user_huge_page(page); /* paired with put_user_huge_page() on every later path */
+ spin_unlock(ptl);
+alloc:
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ if (!page) { /* huge zero pmd: there is no old page to copy */
+ split_huge_page_pmd(vma, address, pmd);
+ ret |= VM_FAULT_FALLBACK;
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM) {
+ split_huge_page(page);
+ ret |= VM_FAULT_FALLBACK;
+ }
+ put_user_huge_page(page);
+ }
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ if (page) {
+ split_huge_page(page);
+ put_user_huge_page(page);
+ } else
+ split_huge_page_pmd(vma, address, pmd);
+ ret |= VM_FAULT_FALLBACK;
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+
+ count_vm_event(THP_FAULT_ALLOC);
+
+ if (!page)
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(ptl);
+ if (page)
+ put_user_huge_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) { /* pmd changed while unlocked: discard the new page */
+ spin_unlock(ptl);
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ goto out_mn;
+ } else {
+ pmd_t entry;
+ entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ pmdp_clear_flush(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, pmd);
+ if (!page) {
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ } else {
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ page_remove_rmap(page);
+ put_page(page);
+ }
+ ret |= VM_FAULT_WRITE;
+ }
+ spin_unlock(ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return ret;
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
+}
+/*
+ * Look up the page that backs @addr inside the huge pmd @pmd.
+ * The pmd lock must be held by the caller (asserted below).
+ *
+ * Returns NULL when FOLL_WRITE is requested on a non-writable pmd or when
+ * FOLL_NUMA is requested on a NUMA hinting pmd, ERR_PTR(-EFAULT) when
+ * FOLL_DUMP hits the huge zero pmd, and otherwise the subpage of the
+ * compound page that corresponds to @addr. With FOLL_GET the subpage is
+ * passed to get_page_foll() before it is returned.
+ */
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ goto out;
+
+ /* Avoid dumping huge zero page */
+ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
+ return ERR_PTR(-EFAULT);
+
+ /* Full NUMA hinting faults to serialise migration in fault paths */
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ if (flags & FOLL_TOUCH) {
+ pmd_t _pmd;
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but
+ * for now the dirty bit in the pmd is meaningless.
+ * And if the dirty bit will become meaningful and
+ * we'll only set it with FOLL_WRITE, an atomic
+ * set_bit will be required on the pmd to set the
+ * young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain();
+ if (page->mapping) /* rechecked now that the page lock is held */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; /* step from the head page to the subpage for addr */
+ VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (flags & FOLL_GET)
+ get_page_foll(page);
+
+out:
+ return page;
+}
+
+/*
+ * NUMA hinting page fault entry point for trans huge pmds.
+ *
+ * Takes the pmd lock and bails out if *pmdp no longer equals @pmd. If
+ * mpol_misplaced() reports no target node and the page lock was obtained,
+ * the NUMA bit is simply cleared from the pmd; otherwise the page is handed
+ * to migrate_misplaced_transhuge_page(). Unless page_nid was reset to -1
+ * along the way, the fault is reported through task_numa_fault().
+ * Always returns 0.
+ */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ spinlock_t *ptl;
+ struct anon_vma *anon_vma = NULL;
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int page_nid = -1, this_nid = numa_node_id();
+ int target_nid, last_cpupid = -1;
+ bool page_locked;
+ bool migrated = false;
+ int flags = 0;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ /*
+ * If there are potential migrations, wait for completion and retry
+ * without disrupting NUMA hinting information. Do not relock and
+ * check_same as the page may no longer be mapped.
+ */
+ if (unlikely(pmd_trans_migrating(*pmdp))) {
+ spin_unlock(ptl);
+ wait_migrate_huge_page(vma->anon_vma, pmdp);
+ goto out;
+ }
+
+ page = pmd_page(pmd);
+ BUG_ON(is_huge_zero_page(page));
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (page_nid == this_nid) {
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pmd_write(pmd))
+ flags |= TNF_NO_GROUP;
+
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+ }
+
+ /* Migration could have started since the pmd_trans_migrating check */
+ if (!page_locked) {
+ spin_unlock(ptl);
+ wait_on_page_locked(page);
+ page_nid = -1; /* suppresses the task_numa_fault() report at out: */
+ goto out;
+ }
+
+ /*
+ * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+ * to serialise splits
+ */
+ get_page(page);
+ spin_unlock(ptl);
+ anon_vma = page_lock_anon_vma_read(page);
+
+ /* Confirm the PMD did not change while page_table_lock was released */
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ page_nid = -1;
+ goto out_unlock;
+ }
+
+ /* Bail if we fail to protect against THP splits for any reason */
+ if (unlikely(!anon_vma)) {
+ put_page(page);
+ page_nid = -1;
+ goto clear_pmdnuma;
+ }
+
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(ptl);
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr, page, target_nid);
+ if (migrated) {
+ flags |= TNF_MIGRATED;
+ page_nid = target_nid;
+ }
+
+ goto out;
+clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
+out_unlock:
+ spin_unlock(ptl);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+
+ return 0;
+}
+/*
+ * Tear down the huge pmd mapping at @addr on behalf of the mmu_gather @tlb.
+ *
+ * Returns 1 if __pmd_trans_huge_lock() reported a stable huge pmd, which is
+ * then cleared here together with its deposited page table; returns 0 and
+ * does nothing otherwise.
+ */
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+{
+ spinlock_t *ptl;
+ int ret = 0;
+
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t orig_pmd;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
+ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ put_huge_zero_page();
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ tlb_remove_page(tlb, page);
+ }
+ pte_free(tlb->mm, pgtable); /* the withdrawn page table is freed in both cases */
+ ret = 1;
+ }
+ return ret;
+}
+
+/*
+ * mincore() helper for a huge pmd.
+ *
+ * When @pmd is a stable transparent huge pmd, every byte of @vec covering
+ * [@addr, @end) is set to 1 and 1 is returned; otherwise @vec is left
+ * untouched and 0 is returned.
+ */
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, unsigned long end,
+		unsigned char *vec)
+{
+	spinlock_t *lock;
+
+	if (__pmd_trans_huge_lock(pmd, vma, &lock) != 1)
+		return 0;
+
+	/*
+	 * A huge page backs the whole range, so all logical pages
+	 * in it are present.
+	 */
+	spin_unlock(lock);
+	memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+	return 1;
+}
+/*
+ * Move a huge pmd entry from @old_pmd/@old_addr to @new_pmd/@new_addr.
+ *
+ * Nothing is moved (and 0 is returned) unless both addresses are huge page
+ * aligned, at least HPAGE_PMD_SIZE remains before @old_end, @new_vma does
+ * not have VM_NOHUGEPAGE set and the destination pmd is empty. Otherwise
+ * the result of __pmd_trans_huge_lock() on @old_pmd is returned, and the
+ * entry is moved only when that result is 1.
+ */
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ int ret = 0;
+ pmd_t pmd;
+
+ struct mm_struct *mm = vma->vm_mm;
+
+ if ((old_addr & ~HPAGE_PMD_MASK) ||
+ (new_addr & ~HPAGE_PMD_MASK) ||
+ old_end - old_addr < HPAGE_PMD_SIZE ||
+ (new_vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ goto out;
+ }
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
+ if (ret == 1) {
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl) /* take the second lock only if it is a different one */
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+
+ if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ pgtable_t pgtable;
+ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+ pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+ }
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+ }
+out:
+ return ret;
+}
+
+/*
+ * Change the protection of a huge pmd to @newprot, or, when @prot_numa is
+ * set, mark it as a NUMA hinting pmd instead.
+ *
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
+ */
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ pmd_t entry;
+ ret = 1;
+ if (!prot_numa) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
+ entry = pmd_modify(entry, newprot);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(pmd_write(entry)); /* the resulting entry is required to be non-writable */
+ } else {
+ struct page *page = pmd_page(*pmd);
+
+ /*
+ * Do not trap faults against the zero page. The
+ * read-only data is likely to be read-cached on the
+ * local CPU cache and it is less useful to know about
+ * local vs remote hits on the zero page.
+ */
+ if (!is_huge_zero_page(page) &&
+ !pmd_numa(*pmd)) {
+ pmdp_set_numa(mm, addr, pmd);
+ ret = HPAGE_PMD_NR;
+ }
+ }
+ spin_unlock(ptl);
+ }
+
+ return ret;
+}
+
+/*
+ * Lock @pmd and classify it:
+ *   1  - it maps a stable transparent huge page (not being split),
+ *  -1  - it maps a transparent huge page that is being split; the lock has
+ *        been dropped and the split has been waited for,
+ *   0  - it is not a transparent huge pmd; the lock has been dropped.
+ *
+ * Only the return value 1 leaves the page table lock held (handed back via
+ * @ptl); the caller is then responsible for unlocking it.
+ */
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+		spinlock_t **ptl)
+{
+	*ptl = pmd_lock(vma->vm_mm, pmd);
+
+	if (unlikely(!pmd_trans_huge(*pmd))) {
+		spin_unlock(*ptl);
+		return 0;
+	}
+
+	/* The thp mapped by 'pmd' is stable, so it can be handled as it is. */
+	if (likely(!pmd_trans_splitting(*pmd)))
+		return 1;
+
+	spin_unlock(*ptl);
+	wait_split_huge_page(vma->anon_vma, pmd);
+	return -1;
+}
+
+/*
+ * This function returns whether a given @page is mapped onto the @address
+ * in the virtual space of @mm by a transparent huge pmd.
+ *
+ * When it is, this function returns the pmd pointer while holding the page
+ * table lock, which is passed back to the caller via @ptl.
+ * Otherwise it returns NULL without holding the page table lock. NULL is
+ * also returned when @address is not huge page aligned, and when @flag is
+ * PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG but the pmd is already splitting.
+ */
+pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag,
+ spinlock_t **ptl)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (address & ~HPAGE_PMD_MASK)
+ return NULL;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return NULL;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+ pmd = pmd_offset(pud, address);
+
+ *ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock;
+ if (pmd_page(*pmd) != page)
+ goto unlock;
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto unlock;
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+ !pmd_trans_splitting(*pmd));
+ return pmd; /* returned with *ptl still held */
+ }
+unlock:
+ spin_unlock(*ptl);
+ return NULL;
+}
+/*
+ * Mark the huge pmd that maps @page at @address in @vma as splitting.
+ * The update is bracketed by mmu notifier range start/end calls covering
+ * HPAGE_PMD_SIZE bytes from @address.
+ * Returns 1 if a matching pmd that was not yet splitting was found and
+ * marked, 0 otherwise.
+ */
+static int __split_huge_page_splitting(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ int ret = 0;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = address;
+ const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
+
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
+ if (pmd) {
+ /*
+ * We can't temporarily set the pmd to null in order
+ * to split it, the pmd must remain marked huge at all
+ * times or the VM won't take the pmd_trans_huge paths
+ * and it won't wait on the anon_vma->root->rwsem to
+ * serialize against split_huge_page*.
+ */
+ pmdp_splitting_flush(vma, address, pmd);
+ ret = 1;
+ spin_unlock(ptl);
+ }
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ return ret;
+}
+/*
+ * Turn the tail pages of the compound @page into independent pages: give
+ * each tail its own reference count, flags, mapcount, mapping and index,
+ * hand it to lru_add_page_tail() together with @list, and finally clear the
+ * compound state of the head. The conversion runs under zone->lru_lock and
+ * the compound lock; one reference per tail is dropped after both locks
+ * have been released.
+ */
+static void __split_huge_page_refcount(struct page *page,
+ struct list_head *list)
+{
+ int i;
+ struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+ int tail_count = 0;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ compound_lock(page);
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(page);
+
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_mapcount cannot change */
+ BUG_ON(page_mapcount(page_tail) < 0);
+ tail_count += page_mapcount(page_tail);
+ /* check for overflow */
+ BUG_ON(tail_count < 0);
+ BUG_ON(atomic_read(&page_tail->_count) != 0);
+ /*
+ * tail_page->_count is zero and not changing from
+ * under us. But get_page_unless_zero() may be running
+ * from under us on the tail_page. If we used
+ * atomic_set() below instead of atomic_add(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is
+ * implemented in C not using locked ops. spin_unlock
+ * on x86 sometime uses locked ops because of PPro
+ * errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and
+ * not only on x86), it's safer to use atomic_add().
+ */
+ atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+ &page_tail->_count);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ /*
+ * retain hwpoison flag of the poisoned tail page:
+ * fix for the unsuitable process killed on Guest Machine(KVM)
+ * by the memory-failure.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_unevictable)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /* clear PageTail before overwriting first_page */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmd that could map this
+ * hugepage, that will ensure no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the below code. So
+ * for this code to be safe, during the split the
+ * mapcount can't change. But that doesn't mean userland
+ * can't keep changing and reading the page contents while
+ * we transfer the mapcount, so the pmd splitting
+ * status is achieved setting a reserved bit in the
+ * pmd, not by clearing the present bit.
+ */
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = page->index + i;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ lru_add_page_tail(page, page_tail, lruvec, list);
+ }
+ atomic_sub(tail_count, &page->_count); /* remove the summed tail mapcounts from the head's count */
+ BUG_ON(atomic_read(&page->_count) <= 0);
+
+ __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+/*
+ * Replace the splitting huge pmd that maps @page at @address in @vma with a
+ * page table of HPAGE_PMD_NR ptes, one per subpage, carrying over the
+ * write, young and NUMA state of the pmd.
+ * Returns 1 if such a pmd was found and remapped, 0 otherwise.
+ */
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
+ if (pmd) {
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable); /* fill the ptes through a local pmd first */
+
+ haddr = address;
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has the whole access to the hugepage
+ * during the split (which happens in place). If we
+ * overwrite the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+ * in the huge TLB. Some CPUs don't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+ * Erratum 383 on page 93. Intel should be safe but it
+ * also warns that it's only safe if the permission
+ * and cache attributes of the two entries loaded in
+ * the two TLBs are identical (which should be the case
+ * here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual
+ * address to be loaded simultaneously. So instead of
+ * doing "pmd_populate(); flush_tlb_range();" we first
+ * mark the current pmd notpresent (atomically because
+ * here the pmd_trans_huge and pmd_trans_splitting
+ * must remain set at all times on the pmd until the
+ * split is complete for this pmd), then we flush the
+ * SMP TLB and finally we write the non-huge version
+ * of the pmd entry with pmd_populate.
+ */
+ pmdp_invalidate(vma, address, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ spin_unlock(ptl);
+ }
+
+ return ret;
+}
+
+/*
+ * Split the huge @page in every vma found for it in @anon_vma's interval
+ * tree: first mark each mapping pmd as splitting, then split the page
+ * itself, then remap each pmd as a page table. BUG()s if the number of
+ * pmds handled in either walk does not match the expected mapcount.
+ *
+ * Must be called with anon_vma->root->rwsem held.
+ */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma,
+ struct list_head *list)
+{
+ int mapcount, mapcount2;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+ * anon_vma list. This guarantees that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page)) {
+ pr_err("mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG();
+ }
+
+ __split_huge_page_refcount(page, list);
+
+ mapcount2 = 0;
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2) { /* both walks must have handled the same number of pmds */
+ pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG();
+ }
+}
+
+/*
+ * Split a hugepage into normal pages. This doesn't change the position of head
+ * page. If @list is null, tail pages will be added to LRU list, otherwise, to
+ * @list. Both head page and tail pages will inherit mapping, flags, and so on
+ * from the hugepage.
+ * Return 0 if the hugepage is split successfully (or is found to be no
+ * longer compound once the anon_vma is locked), otherwise return 1. The
+ * only failure visible here is not being able to get the page's anon_vma.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ struct anon_vma *anon_vma;
+ int ret = 1;
+
+ BUG_ON(is_huge_zero_page(page));
+ BUG_ON(!PageAnon(page));
+
+ /*
+ * The caller does not necessarily hold an mmap_sem that would prevent
+ * the anon_vma disappearing so we first take a reference to it
+ * and then lock the anon_vma for write. This is similar to
+ * page_lock_anon_vma_read except the write lock is taken to serialise
+ * against parallel split or collapse operations.
+ */
+ anon_vma = page_get_anon_vma(page);
+ if (!anon_vma)
+ goto out;
+ anon_vma_lock_write(anon_vma);
+
+ ret = 0;
+ if (!PageCompound(page)) /* nothing left to split: still reported as success */
+ goto out_unlock;
+
+ BUG_ON(!PageSwapBacked(page));
+ __split_huge_page(page, anon_vma, list);
+ count_vm_event(THP_SPLIT);
+
+ BUG_ON(PageCompound(page));
+out_unlock:
+ anon_vma_unlock_write(anon_vma);
+ put_anon_vma(anon_vma);
+out:
+ return ret;
+}
+/* vma flags that hugepage_madvise() refuses and khugepaged_enter_vma_merge() asserts are absent */
+#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
+/*
+ * Apply MADV_HUGEPAGE or MADV_NOHUGEPAGE to *@vm_flags for @vma.
+ *
+ * Returns 0 on success and for any other @advice value, -EINVAL if the
+ * requested flag is already set or any VM_NO_THP flag is set, and -ENOMEM
+ * if khugepaged_enter_vma_merge() fails after MADV_HUGEPAGE was applied.
+ * On s390, MADV_HUGEPAGE is silently ignored (0 is returned) when
+ * mm_has_pgste() is true for the vma's mm.
+ */
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+ /*
+ * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+ * can't handle this properly after s390_enable_sie, so we simply
+ * ignore the madvise to prevent qemu from causing a SIGSEGV.
+ */
+ if (mm_has_pgste(vma->vm_mm))
+ return 0;
+#endif
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma becomes good for khugepaged to scan,
+ * register it here without waiting for a page fault that
+ * may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Create the "khugepaged_mm_slot" slab cache used for struct mm_slot.
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+static int __init khugepaged_slab_init(void)
+{
+	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+					  sizeof(struct mm_slot),
+					  __alignof__(struct mm_slot), 0, NULL);
+
+	return mm_slot_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Allocate a zeroed mm_slot from mm_slot_cache.
+ * Returns NULL if the allocation fails or if the cache was never created
+ * (initialization failed).
+ */
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+	struct mm_slot *slot = NULL;
+
+	if (mm_slot_cache)
+		slot = kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+
+	return slot;
+}
+/* Give @mm_slot back to mm_slot_cache. */
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+/*
+ * Look up the mm_slot registered for @mm in mm_slots_hash, using the mm
+ * pointer value as the key. Returns NULL if no slot matches.
+ */
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
+ if (mm == mm_slot->mm)
+ return mm_slot;
+
+ return NULL;
+}
+
+/*
+ * Bind @mm_slot to @mm and add it to mm_slots_hash, keyed by the mm
+ * pointer value. The key is cast to unsigned long, the same type the
+ * lookup in get_mm_slot() uses, so insertion and lookup visibly agree.
+ */
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+				    struct mm_slot *mm_slot)
+{
+	mm_slot->mm = mm;
+	hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm);
+}
+/* Returns nonzero when mm->mm_users has dropped to zero. */
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+/*
+ * Register @mm with khugepaged: allocate an mm_slot, set MMF_VM_HUGEPAGE,
+ * hash the slot and queue it on khugepaged_scan.mm_head, then take an
+ * mm_count reference. khugepaged_wait is woken if the list was empty.
+ * Returns -ENOMEM if no slot could be allocated, 0 otherwise (including
+ * when MMF_VM_HUGEPAGE was already set and nothing was added).
+ */
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON(khugepaged_test_exit(mm));
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot); /* flag was already set: drop the unused slot */
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+/*
+ * Register @vma's mm with khugepaged if the vma can hold at least one
+ * aligned huge page.
+ *
+ * Returns 0 without registering when the vma has no anon_vma yet, has
+ * vm_ops set, or contains no huge page aligned range; otherwise returns
+ * whatever khugepaged_enter() returns.
+ */
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+	unsigned long first_hpage, last_hpage;
+
+	/*
+	 * No anon_vma: not yet faulted in, registration happens later in
+	 * the page fault if needed. vm_ops set: khugepaged is not yet
+	 * working on file or special mappings.
+	 */
+	if (!vma->anon_vma || vma->vm_ops)
+		return 0;
+
+	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+
+	/* round the start up and the end down to huge page boundaries */
+	first_hpage = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	last_hpage = vma->vm_end & HPAGE_PMD_MASK;
+
+	return first_hpage < last_hpage ? khugepaged_enter(vma) : 0;
+}
+/*
+ * Unregister @mm from khugepaged. If the mm has a slot that is not the one
+ * recorded in khugepaged_scan.mm_slot, the slot is unhashed, unlinked and
+ * freed, MMF_VM_HUGEPAGE is cleared and the mm reference is dropped. If
+ * the slot is the current scan slot it is left in place and mmap_sem is
+ * taken and released for write instead (see the comment in the body).
+ */
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ if (free) {
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
+}
+/*
+ * Undo the isolation of one page: decrement its zone's NR_ISOLATED_ANON
+ * counter, unlock the page and put it back on the LRU.
+ */
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+/*
+ * Release the pages mapped by the ptes in [@pte, @_pte), walking backwards
+ * from the entry just below @_pte down to @pte. Empty ptes are skipped.
+ */
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+	/*
+	 * Compare before decrementing so the cursor never moves below @pte:
+	 * forming a pointer before the start of the table is undefined in C.
+	 */
+	while (_pte > pte) {
+		pte_t pteval = *--_pte;
+
+		if (!pte_none(pteval))
+			release_pte_page(pte_page(pteval));
+	}
+}
+/*
+ * Lock and isolate from the LRU every page mapped by the HPAGE_PMD_NR ptes
+ * starting at @pte (virtual address @address). Up to
+ * khugepaged_max_ptes_none empty ptes are tolerated; every other pte must
+ * be present and writable and map a page with a page count of 1.
+ * Returns 1 if all ptes qualified and at least one was seen as referenced;
+ * otherwise the pages isolated so far are released and 0 is returned.
+ */
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out;
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page))
+ goto out;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out;
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page))
+ goto out;
+ /*
+ * Isolate the page to avoid collapsing an hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ /* If there is no mapped pte young don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (likely(referenced))
+ return 1;
+out:
+ release_pte_pages(pte, _pte); /* undo only the ptes before the current one */
+ return 0;
+}
+/*
+ * Fill the HPAGE_PMD_NR subpages starting at @page from the ptes starting
+ * at @pte: an empty pte yields a cleared subpage (and one more
+ * MM_ANONPAGES), a populated pte has its page copied into the subpage,
+ * released, unmapped under @ptl and freed.
+ */
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++; /* advance to the next destination subpage */
+ }
+}
+/*
+ * Wait on khugepaged_wait with a timeout of khugepaged_alloc_sleep_millisecs.
+ * The wait condition passed is the constant false.
+ */
+static void khugepaged_alloc_sleep(void)
+{
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
+/* Per-node counters, indexed by node id; read by khugepaged_find_target_node(). */
+static int khugepaged_node_load[MAX_NUMNODES];
+/*
+ * khugepaged_find_target_node() picks a node from khugepaged_node_load[]:
+ * the first node with the largest value (node 0 if no value is positive).
+ * If that node is not above the node returned by the previous call, the
+ * first node after the previous one with the same value is chosen instead,
+ * if there is one. The choice is remembered for the next call.
+ */
+#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balance if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) /* NUMA: reset *hpage before the next scan step; false tells the caller to stop */
+{
+ if (IS_ERR(*hpage)) { /* khugepaged_alloc_page() stores ERR_PTR(-ENOMEM) here when allocation fails */
+ if (!*wait)
+ return false;
+
+ *wait = false; /* a further failure returns false instead of sleeping again */
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) { /* a page is still held in *hpage: drop that reference */
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, /* NUMA: allocate the hugepage on @node, then drop the mmap_sem read lock; NULL on failure */
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ VM_BUG_ON_PAGE(*hpage, *hpage); /* *hpage is expected to be NULL on entry */
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ /*
+ * After allocating the hugepage, release the mmap_sem read lock in
+ * preparation for taking it in write mode.
+ */
+ up_read(&mm->mmap_sem);
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM); /* failure marker tested by khugepaged_prealloc_page() via IS_ERR() */
+ return NULL;
+ }
+
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ return *hpage;
+}
+#else
+static int khugepaged_find_target_node(void) /* !CONFIG_NUMA variant: always node 0 */
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag) /* allocate one HPAGE_PMD_ORDER page with the gfp mask derived from @defrag */
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
+static struct page *khugepaged_alloc_hugepage(bool *wait) /* !NUMA: allocate a hugepage, sleeping and retrying once while *wait is set; NULL on failure */
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false; /* the next failure returns NULL instead of sleeping again */
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled())); /* retry only while khugepaged is still enabled */
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) /* !NUMA: make sure *hpage holds a hugepage before scanning; false if none could be allocated */
+{
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, /* !NUMA: no allocation here; drop the mmap_sem read lock and return the page already in *hpage */
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ up_read(&mm->mmap_sem);
+ VM_BUG_ON(!*hpage); /* *hpage is expected to be non-NULL here */
+ return *hpage;
+}
+#endif
+
+static bool hugepage_vma_check(struct vm_area_struct *vma) /* true when @vma passes the flag, anon_vma/vm_ops and temporary-stack checks below */
+{
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || /* needs VM_HUGEPAGE unless khugepaged_always() */
+ (vma->vm_flags & VM_NOHUGEPAGE)) /* VM_NOHUGEPAGE always excludes the vma */
+ return false;
+
+ if (!vma->anon_vma || vma->vm_ops) /* requires an anon_vma and no vm_ops */
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ return true;
+}
+
+static void collapse_huge_page(struct mm_struct *mm, /* replace the ptes covering the hugepage-aligned @address with one huge pmd backed by a new page */
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
+{
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *pmd_ptl, *pte_ptl;
+ int isolated;
+ unsigned long hstart, hend;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* release the mmap_sem read lock. */
+ new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ if (!new_page)
+ return;
+
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
+ return;
+
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ vma = find_vma(mm, address); /* look the vma up again: mmap_sem was dropped and retaken above */
+ if (!vma)
+ goto out;
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; /* vm_start rounded up to a hugepage boundary */
+ hend = vma->vm_end & HPAGE_PMD_MASK; /* vm_end rounded down to a hugepage boundary */
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ goto out;
+ if (!hugepage_vma_check(vma))
+ goto out;
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+
+ anon_vma_lock_write(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ pte_ptl = pte_lockptr(mm, pmd);
+
+ mmun_start = address;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_clear_flush(vma, address, pmd); /* _pmd keeps the old entry so the page table can be restored or deposited later */
+ spin_unlock(pmd_ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ spin_lock(pte_ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(pte_ptl);
+
+ if (unlikely(!isolated)) { /* isolation failed: put the original page table back and give up */
+ pte_unmap(pte);
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ /*
+ * We can only use set_pmd_at when establishing
+ * hugepmds and never for establishing regular pmds that
+ * points to regular pagetables. Use pmd_populate for that
+ */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ spin_unlock(pmd_ptl);
+ anon_vma_unlock_write(vma->anon_vma);
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock_write(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd); /* the old page table, deposited under the new huge pmd below */
+
+ _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to avoid the copy_huge_page writes to become
+ * visible after the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(pmd_ptl);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache_pmd(vma, address, pmd);
+ spin_unlock(pmd_ptl);
+
+ *hpage = NULL; /* the page is now mapped; clear the caller's slot so it is not put or reused */
+
+ khugepaged_pages_collapsed++;
+out_up_write:
+ up_write(&mm->mmap_sem);
+ return;
+
+out:
+ mem_cgroup_uncharge_page(new_page); /* failure after the charge above: undo it before unlocking */
+ goto out_up_write;
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm, /* scan the ptes under one pmd; returns 1 if the range was handed to collapse_huge_page(), else 0 */
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, referenced = 0, none = 0;
+ struct page *page;
+ unsigned long _address;
+ spinlock_t *ptl;
+ int node = NUMA_NO_NODE;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ goto out;
+
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); /* restart the per-node counts for this pmd */
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) { /* empty ptes are tolerated up to khugepaged_max_ptes_none */
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out_unmap;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out_unmap;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ goto out_unmap;
+ /*
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * khugepaged will allocate the hugepage from the node that has
+ * the max hit record.
+ */
+ node = page_to_nid(page);
+ khugepaged_node_load[node]++;
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out_unmap;
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address)) /* NOTE(review): passes the pmd base address, not _address; confirm intended */
+ referenced = 1;
+ }
+ if (referenced) /* reached only when every pte passed the checks above */
+ ret = 1;
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret) {
+ node = khugepaged_find_target_node();
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node);
+ }
+out:
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot) /* if the slot's mm passes khugepaged_test_exit(), unlink and free the slot and drop the mm reference */
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); /* khugepaged_mm_lock is expected to be held */
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hash_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, /* scan the current mm_slot from khugepaged_scan.address; returns the progress count accumulated */
+ struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot) /* resume the slot left from the previous call */
+ mm_slot = khugepaged_scan.mm_slot;
+ else { /* otherwise start at the head of the mm list, from address 0 */
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock); /* dropped for the scan, retaken before returning */
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+ if (!hugepage_vma_check(vma)) {
+skip:
+ progress++;
+ continue;
+ }
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; /* vm_start rounded up to a hugepage boundary */
+ hend = vma->vm_end & HPAGE_PMD_MASK; /* vm_end rounded down to a hugepage boundary */
+ if (hstart >= hend) /* vma holds no complete hugepage-aligned range */
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { /* advance to the next slot in the list */
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else { /* this was the last slot: one full pass over the list is done */
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void) /* nonzero when the mm list is non-empty and khugepaged is enabled */
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void) /* wake-up condition: the mm list is non-empty or the kthread was asked to stop */
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ kthread_should_stop();
+}
+
+static void khugepaged_do_scan(void) /* one scan round: keep scanning mm slots until khugepaged_pages_to_scan progress is made or a stop condition hits */
+{
+ struct page *hpage = NULL;
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ if (!khugepaged_prealloc_page(&hpage, &wait))
+ break;
+
+ cond_resched();
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot) /* no current slot: the next scan starts from the list head */
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2) /* stop on the second start from the head within this round */
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ &hpage);
+ else
+ progress = pages; /* forces the loop to end */
+ spin_unlock(&khugepaged_mm_lock);
+ }
+
+ if (!IS_ERR_OR_NULL(hpage)) /* drop a page that is still held when the round ends */
+ put_page(hpage);
+}
+
+static void khugepaged_wait_work(void) /* sleep between scan rounds: timed when there is work, otherwise until khugepaged_wait_event() */
+{
+ try_to_freeze();
+
+ if (khugepaged_has_work()) {
+ if (!khugepaged_scan_sleep_millisecs) /* zero means no sleep between rounds */
+ return;
+
+ wait_event_freezable_timeout(khugepaged_wait,
+ kthread_should_stop(),
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+ return;
+ }
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
+}
+
+static int khugepaged(void *none) /* kthread body: alternate scan rounds and waits until asked to stop; @none is unused */
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, MAX_NICE);
+
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot; /* on exit, detach the current slot and let collect_mm_slot() decide whether to free it */
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
+
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, /* replace a huge zero pmd with a page table of special zero-pfn ptes */
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable); /* build the table through the local _pmd first */
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
+}
+
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, /* split the huge pmd covering @address, repeating while *pmd is still huge */
+ pmd_t *pmd)
+{
+ spinlock_t *ptl;
+ struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); /* the whole hugepage range must lie inside @vma */
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) { /* not a huge pmd: nothing to split */
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) { /* huge zero page: split in place under the pmd lock */
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ get_page(page); /* hold a reference across the unlocked split_huge_page() call */
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ split_huge_page(page);
+
+ put_page(page);
+
+ /*
+ * We don't always have down_write of mmap_sem here: a racing
+ * do_huge_pmd_wp_page() might have copied-on-write to another
+ * huge page before our split_huge_page() got the anon_vma lock.
+ */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ goto again;
+}
+
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, /* look up the vma for @address in @mm and split the huge pmd there */
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL); /* a vma must exist for @address */
+ split_huge_page_pmd(vma, address, pmd);
+}
+
+static void split_huge_page_address(struct mm_struct *mm, /* walk pgd/pud/pmd for @address and split the pmd if all levels are present */
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); /* @address must NOT be hugepage aligned */
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd_mm(mm, address, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma, /* split huge pmds that the new @start, @end or next->vm_start would cut through */
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) /* the enclosing hugepage range lies fully inside the vma */
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't hpage aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT; /* adjust_next is shifted by PAGE_SHIFT, i.e. treated as a page count */
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c7d03dbf73..7a0a73d2fcf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,276 +1,2476 @@
/*
* Generic hugetlb support.
- * (C) William Irwin, April 2004
+ * (C) Nadia Yvette Chambers, April 2004
*/
-#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
+#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/page-isolation.h>
+#include <linux/jhash.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/node.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
-unsigned long max_huge_pages;
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static unsigned int nr_huge_pages_node[MAX_NUMNODES];
-static unsigned int free_huge_pages_node[MAX_NUMNODES];
+unsigned long hugepages_treat_as_movable;
+
+int hugetlb_max_hstate __read_mostly;
+unsigned int default_hstate_idx;
+struct hstate hstates[HUGE_MAX_HSTATE];
+
+__initdata LIST_HEAD(huge_boot_pages);
+
+/* for command line parsing */
+static struct hstate * __initdata parsed_hstate;
+static unsigned long __initdata default_hstate_max_huge_pages;
+static unsigned long __initdata default_hstate_size;
+
/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
+ * free_huge_pages, and surplus_huge_pages.
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
-static void clear_huge_page(struct page *page, unsigned long addr)
+/*
+ * Serializes faults on the same logical page. This is used to
+ * prevent spurious OOMs when the hugepage pool is fully utilized.
+ */
+static int num_fault_mutexes;
+static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) /* called with spool->lock held; unlocks it and frees @spool once unused */
{
- int i;
+ bool free = (spool->count == 0) && (spool->used_hpages == 0); /* decided while the lock is still held */
+
+ spin_unlock(&spool->lock);
+
+ /* If no pages are used, and no other handles to the subpool
+ * remain, free the subpool. */
+ if (free)
+ kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) /* allocate a subpool limited to @nr_blocks huge pages; NULL if kmalloc fails */
+{
+ struct hugepage_subpool *spool;
+
+ spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ if (!spool)
+ return NULL;
+
+ spin_lock_init(&spool->lock);
+ spool->count = 1; /* the handle returned to the caller */
+ spool->max_hpages = nr_blocks;
+ spool->used_hpages = 0;
+
+ return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool) /* drop one handle on @spool; it is freed once count and used_hpages are both zero */
+{
+ spin_lock(&spool->lock);
+ BUG_ON(!spool->count);
+ spool->count--;
+ unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, /* charge @delta pages to @spool; 0 on success (or NULL spool), -ENOMEM if over max_hpages */
+ long delta)
+{
+ int ret = 0;
+
+ if (!spool)
+ return 0;
- might_sleep();
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
- cond_resched();
- clear_user_highpage(page + i, addr);
+ spin_lock(&spool->lock);
+ if ((spool->used_hpages + delta) <= spool->max_hpages) {
+ spool->used_hpages += delta;
+ } else {
+ ret = -ENOMEM;
}
+ spin_unlock(&spool->lock);
+
+ return ret;
}
-static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr)
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, /* return @delta pages to @spool; no-op for a NULL spool */
+ long delta)
{
- int i;
+ if (!spool)
+ return;
+
+ spin_lock(&spool->lock);
+ spool->used_hpages -= delta;
+ /* If hugetlbfs_put_super couldn't free spool due to
+ * an outstanding quota reference, free it now. */
+ unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode) /* subpool recorded in the hugetlbfs superblock info of @inode */
+{
+ return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) /* subpool of the inode behind vma->vm_file */
+{
+ return subpool_inode(file_inode(vma->vm_file));
+}
+
+/*
+ * Region tracking -- allows tracking of reservations and instantiated pages
+ * across the pages in a mapping.
+ *
+ * The region data structures are embedded into a resv_map and
+ * protected by a resv_map's lock
+ */
+struct file_region { /* one tracked range on a resv_map's regions list */
+ struct list_head link; /* entry in resv_map->regions */
+ long from; /* start of the range */
+ long to; /* end of the range; region_count() treats [from, to) as half-open */
+};
+
+static long region_add(struct resv_map *resv, long f, long t) /* merge the range f..t into @resv's region list; always returns 0 */
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg, *nrg, *trg;
+
+ spin_lock(&resv->lock);
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+
+ /* Check for and consume any regions we now overlap with. */
+ nrg = rg; /* the region found above is reused to hold the merged range */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it. */
+ if (rg->to > t)
+ t = rg->to;
+ if (rg != nrg) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ nrg->from = f; /* NOTE(review): assumes the lookup above found a real region, not the list head; confirm callers guarantee it */
+ nrg->to = t;
+ spin_unlock(&resv->lock);
+ return 0;
+}
+
+static long region_chg(struct resv_map *resv, long f, long t) /* compute the change adding f..t would make; may insert a zero-size placeholder; -ENOMEM on allocation failure */
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg, *nrg = NULL;
+ long chg = 0;
+
+retry:
+ spin_lock(&resv->lock);
+ /* Locate the region we are before or in. */
+ list_for_each_entry(rg, head, link)
+ if (f <= rg->to)
+ break;
+
+ /* If we are below the current region then a new region is required.
+ * Subtle, allocate a new region at the position but make it zero
+ * size such that we can guarantee to record the reservation. */
+ if (&rg->link == head || t < rg->from) {
+ if (!nrg) {
+ spin_unlock(&resv->lock); /* drop the lock around the GFP_KERNEL allocation, then redo the lookup */
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+
+ nrg->from = f;
+ nrg->to = f;
+ INIT_LIST_HEAD(&nrg->link);
+ goto retry;
+ }
+
+ list_add(&nrg->link, rg->link.prev);
+ chg = t - f;
+ goto out_nrg;
+ }
+
+ /* Round our left edge to the current segment if it encloses us. */
+ if (f > rg->from)
+ f = rg->from;
+ chg = t - f;
+
+ /* Check for and consume any regions we now overlap with. */
+ list_for_each_entry(rg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > t)
+ goto out;
+
+ /* We overlap with this area, if it extends further than
+ * us then we must extend ourselves. Account for its
+ * existing reservation. */
+ if (rg->to > t) {
+ chg += rg->to - t;
+ t = rg->to;
+ }
+ chg -= rg->to - rg->from;
+ }
+
+out:
+ spin_unlock(&resv->lock);
+ /* We already know we raced and no longer need the new region */
+ kfree(nrg); /* nrg was not inserted on this path; it may also still be NULL */
+ return chg;
+out_nrg:
+ spin_unlock(&resv->lock); /* nrg is now on the list, so it is not freed */
+ return chg;
+}
+
+static long region_truncate(struct resv_map *resv, long end) /* cut every region back to @end; returns the total length removed */
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg, *trg;
+ long chg = 0;
+
+ spin_lock(&resv->lock);
+ /* Locate the region we are either in or before. */
+ list_for_each_entry(rg, head, link)
+ if (end <= rg->to)
+ break;
+ if (&rg->link == head) /* no region reaches @end: nothing to remove */
+ goto out;
+
+ /* If we are in the middle of a region then adjust it. */
+ if (end > rg->from) {
+ chg = rg->to - end;
+ rg->to = end;
+ rg = list_entry(rg->link.next, typeof(*rg), link);
+ }
+
+ /* Drop any remaining regions. */
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ chg += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+out:
+ spin_unlock(&resv->lock);
+ return chg;
+}
+
+static long region_count(struct resv_map *resv, long f, long t) /* total length of the overlap between [f, t) and the regions in @resv */
+{
+ struct list_head *head = &resv->regions;
+ struct file_region *rg;
+ long chg = 0;
+
+ spin_lock(&resv->lock);
+ /* Locate each segment we overlap with, and count that overlap. */
+ list_for_each_entry(rg, head, link) {
+ long seg_from;
+ long seg_to;
+
+ if (rg->to <= f) /* region ends at or before f: skip */
+ continue;
+ if (rg->from >= t) /* region starts at or after t: stop */
+ break;
+
+ seg_from = max(rg->from, f);
+ seg_to = min(rg->to, t);
+
+ chg += seg_to - seg_from;
+ }
+ spin_unlock(&resv->lock);
+
+ return chg;
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_hugecache_offset(struct hstate *h, /* huge-page index of @address: offset into the vma plus vm_pgoff, both scaled to huge pages */
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma, /* vma_hugecache_offset() using the hstate of @vma */
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
+/*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * of cases this will be the same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) /* PAGE_SIZE for non-hugetlb vmas, else the vma hstate's huge page size in bytes */
+{
+ struct hstate *hstate;
+
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ hstate = hstate_vma(vma);
+
+ return 1UL << huge_page_shift(hstate);
+}
+EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize /* compiled only when no vma_mmu_pagesize macro is already defined */
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) /* default: same as vma_kernel_pagesize() */
+{
+ return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
- might_sleep();
- for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file; this region map represents the backing file
+ * pages which have ever had a reservation assigned, and this persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it, this region map represents those offsets which have consumed
+ * reservation ie. where pages have been instantiated.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma) /* vm_private_data as an integer (resv_map pointer plus HPAGE_RESV_* flag bits) */
+{
+ return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma, /* store @value in vm_private_data */
+ unsigned long value)
+{
+ vma->vm_private_data = (void *)value;
+}
+
+struct resv_map *resv_map_alloc(void) /* allocate and initialise an empty resv_map; NULL if kmalloc fails */
+{
+ struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+ if (!resv_map)
+ return NULL;
+
+ kref_init(&resv_map->refs);
+ spin_lock_init(&resv_map->lock);
+ INIT_LIST_HEAD(&resv_map->regions);
+
+ return resv_map;
+}
+
+void resv_map_release(struct kref *ref) /* takes the kref embedded in a resv_map: drops all regions, then frees the map */
+{
+ struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+ /* Clear out any active regions before we release the map. */
+ region_truncate(resv_map, 0);
+ kfree(resv_map);
+}
+
+static inline struct resv_map *inode_resv_map(struct inode *inode) /* the resv_map kept in the inode mapping's private_data */
+{
+ return inode->i_mapping->private_data;
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma) /* resv_map for @vma: the inode's for VM_MAYSHARE, else the pointer held in vm_private_data */
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ return inode_resv_map(inode);
+
+ } else {
+ return (struct resv_map *)(get_vma_private_data(vma) & /* mask off the HPAGE_RESV_* flag bits */
+ ~HPAGE_RESV_MASK);
}
}
-static void enqueue_huge_page(struct page *page)
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) /* store @map in vm_private_data, keeping the existing HPAGE_RESV_* flag bits */
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); /* not for VM_MAYSHARE vmas */
+
+ set_vma_private_data(vma, (get_vma_private_data(vma) &
+ HPAGE_RESV_MASK) | (unsigned long)map);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) /* OR @flags into vm_private_data */
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); /* not for VM_MAYSHARE vmas */
+
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) /* nonzero if any bit of @flag is set in vm_private_data */
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+ return (get_vma_private_data(vma) & flag) != 0;
+}
+
+/* Clear vm_private_data (reservation map pointer and all HPAGE_RESV_* flags) */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ if (!(vma->vm_flags & VM_MAYSHARE)) /* VM_MAYSHARE vmas are left untouched */
+ vma->vm_private_data = (void *)0;
+}
+
+/* Returns 1 if the VMA has associated reserve pages, 0 otherwise */
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+{
+ if (vma->vm_flags & VM_NORESERVE) {
+ /*
+ * This address is already reserved by other process(chg == 0),
+ * so, we should decrement reserved count. Without decrementing,
+ * reserve count remains after releasing inode, because this
+ * allocated page will go into page cache and is regarded as
+ * coming from reserved pool in releasing step. Currently, we
+ * don't have any other solution to deal with this situation
+ * properly, so add work-around here.
+ */
+ if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+ return 1;
+ else
+ return 0;
+ }
+
+ /* Shared mappings always use reserves */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return 1;
+
+ /*
+ * Only the process that called mmap() has reserves for
+ * private mappings.
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return 1;
+
+ return 0;
+}
+
+static void enqueue_huge_page(struct hstate *h, struct page *page) /* put @page on its node's free list in @h and bump the free counters */
{
int nid = page_to_nid(page);
- list_add(&page->lru, &hugepage_freelists[nid]);
- free_huge_pages++;
- free_huge_pages_node[nid]++;
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
+ h->free_huge_pages++;
+ h->free_huge_pages_node[nid]++;
}
-static struct page *dequeue_huge_page(struct vm_area_struct *vma,
- unsigned long address)
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
- int nid = numa_node_id();
- struct page *page = NULL;
- struct zonelist *zonelist = huge_zonelist(vma, address);
- struct zone **z;
+ struct page *page;
- for (z = zonelist->zones; *z; z++) {
- nid = zone_to_nid(*z);
- if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
- !list_empty(&hugepage_freelists[nid]))
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+ if (!is_migrate_isolate_page(page))
break;
+ /*
+ * if 'non-isolated free hugepage' not found on the list,
+ * the allocation fails.
+ */
+ if (&h->hugepage_freelists[nid] == &page->lru)
+ return NULL;
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
+/* Movability of hugepages depends on migration support. */
+static inline gfp_t htlb_alloc_mask(struct hstate *h)
+{
+ if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+ return GFP_HIGHUSER_MOVABLE;
+ else
+ return GFP_HIGHUSER;
+}
+
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long address, int avoid_reserve,
+ long chg)
+{
+ struct page *page = NULL;
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * A child process with MAP_PRIVATE mappings created by its parent
+ * has no page reserves. This check ensures that reservations are
+ * not "stolen". The child may still get SIGKILLed.
+ */
+ if (!vma_has_reserves(vma, chg) &&
+ h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+ /* If reserves cannot be used, ensure enough pages are in the pool */
+ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ goto err;
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = huge_zonelist(vma, address,
+ htlb_alloc_mask(h), &mpol, &nodemask);
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ MAX_NR_ZONES - 1, nodemask) {
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (avoid_reserve)
+ break;
+ if (!vma_has_reserves(vma, chg))
+ break;
+
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
+ break;
+ }
+ }
}
- if (*z) {
- page = list_entry(hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
+ mpol_cond_put(mpol);
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
+
+err:
+ return NULL;
+}
+
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __ClearPageTail(p);
+ set_page_refcounted(p);
+ p->first_page = NULL;
}
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+ free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ if (!pfn_valid(i))
+ return false;
+
+ page = pfn_to_page(i);
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+ unsigned long nr_pages = 1 << order;
+ unsigned long ret, pfn, flags;
+ struct zone *z;
+
+ z = NODE_DATA(nid)->node_zones;
+ for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+ spin_lock_irqsave(&z->lock, flags);
+
+ pfn = ALIGN(z->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&z->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+
+ spin_unlock_irqrestore(&z->lock, flags);
+ }
+
+ return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page) {
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, nid);
+ }
+
return page;
}
-static void free_huge_page(struct page *page)
+static int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ struct page *page = NULL;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_gigantic_page_node(h, node);
+ if (page)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order) { }
+static inline int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed) { return 0; }
+#endif
+
+static void update_and_free_page(struct hstate *h, struct page *page)
+{
+ int i;
+
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ return;
+
+ h->nr_huge_pages--;
+ h->nr_huge_pages_node[page_to_nid(page)]--;
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
+ }
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ set_compound_page_dtor(page, NULL);
+ set_page_refcounted(page);
+ if (hstate_is_gigantic(h)) {
+ destroy_compound_gigantic_page(page, huge_page_order(h));
+ free_gigantic_page(page, huge_page_order(h));
+ } else {
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
+ }
+}
+
+struct hstate *size_to_hstate(unsigned long size)
{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ if (huge_page_size(h) == size)
+ return h;
+ }
+ return NULL;
+}
+
+void free_huge_page(struct page *page)
+{
+ /*
+ * Can't pass hstate in here because it is called from the
+ * compound page destructor.
+ */
+ struct hstate *h = page_hstate(page);
+ int nid = page_to_nid(page);
+ struct hugepage_subpool *spool =
+ (struct hugepage_subpool *)page_private(page);
+ bool restore_reserve;
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
+ restore_reserve = PagePrivate(page);
+ ClearPagePrivate(page);
- INIT_LIST_HEAD(&page->lru);
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
+ if (restore_reserve)
+ h->resv_huge_pages++;
+
+ if (h->surplus_huge_pages_node[nid]) {
+ /* remove the page from active list */
+ list_del(&page->lru);
+ update_and_free_page(h, page);
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[nid]--;
+ } else {
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+ }
+ spin_unlock(&hugetlb_lock);
+ hugepage_subpool_put_pages(spool, 1);
+}
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+ INIT_LIST_HEAD(&page->lru);
+ set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
+ set_hugetlb_cgroup(page, NULL);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
}
-static int alloc_fresh_huge_page(void)
+static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ /* we rely on prep_new_huge_page to set the destructor */
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ __ClearPageReserved(page);
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __SetPageTail(p);
+ /*
+ * For gigantic hugepages allocated through bootmem at
+ * boot, it's safer to be consistent with the not-gigantic
+ * hugepages and clear the PG_reserved bit from all tail pages
+ * too. Otherwise drivers using get_user_pages() to access tail
+ * pages may get the reference counting wrong if they see
+ * PG_reserved set on a tail page (despite the head page not
+ * having PG_reserved set). Enforcing this consistency between
+ * head and tail pages allows drivers to optimize away a check
+ * on the head page when they need to know if put_page() is needed
+ * after get_user_pages().
+ */
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ p->first_page = page;
+ }
+}
+
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
+int PageHuge(struct page *page)
+{
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ return get_compound_page_dtor(page) == free_huge_page;
+}
+EXPORT_SYMBOL_GPL(PageHuge);
+
+/*
+ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
+ * normal or transparent huge pages.
+ */
+int PageHeadHuge(struct page *page_head)
+{
+ if (!PageHead(page_head))
+ return 0;
+
+ return get_compound_page_dtor(page_head) == free_huge_page;
+}
+
+pgoff_t __basepage_index(struct page *page)
+{
+ struct page *page_head = compound_head(page);
+ pgoff_t index = page_index(page_head);
+ unsigned long compound_idx;
+
+ if (!PageHuge(page_head))
+ return page_index(page);
+
+ if (compound_order(page_head) >= MAX_ORDER)
+ compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+ else
+ compound_idx = page - page_head;
+
+ return (index << compound_order(page_head)) + compound_idx;
+}
+
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
- static int nid = 0;
struct page *page;
- page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
+
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
if (page) {
- page[1].lru.next = (void *)free_huge_page; /* dtor */
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
- return 1;
+ if (arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ return NULL;
+ }
+ prep_new_huge_page(h, page, nid);
}
- return 0;
+
+ return page;
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
- unsigned long addr)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ struct page *page;
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_huge_page_node(h, node);
+ if (page) {
+ ret = 1;
+ break;
+ }
+ }
+
+ if (ret)
+ count_vm_event(HTLB_BUDDY_PGALLOC);
+ else
+ count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+ return ret;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ bool acct_surplus)
+{
+ int nr_nodes, node;
+ int ret = 0;
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
+ !list_empty(&h->hugepage_freelists[node])) {
+ struct page *page =
+ list_entry(h->hugepage_freelists[node].next,
+ struct page, lru);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[node]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[node]--;
+ }
+ update_and_free_page(h, page);
+ ret = 1;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Dissolve a given free hugepage into free buddy pages. This function does
+ * nothing for in-use (including surplus) hugepages.
+ */
+static void dissolve_free_huge_page(struct page *page)
+{
+ spin_lock(&hugetlb_lock);
+ if (PageHuge(page) && !page_count(page)) {
+ struct hstate *h = page_hstate(page);
+ int nid = page_to_nid(page);
+ list_del(&page->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ update_and_free_page(h, page);
+ }
+ spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
+ * make specified memory blocks removable from the system.
+ * Note that start_pfn should be aligned with (minimum) hugepage size.
+ */
+void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned int order = 8 * sizeof(void *);
+ unsigned long pfn;
+ struct hstate *h;
+
+ /* Set scan step to minimum hugepage size */
+ for_each_hstate(h)
+ if (order > huge_page_order(h))
+ order = huge_page_order(h);
+ VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+ dissolve_free_huge_page(pfn_to_page(pfn));
+}
+
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
+ unsigned int r_nid;
+
+ if (hstate_is_gigantic(h))
+ return NULL;
+ /*
+ * Assume we will successfully allocate the surplus page to
+ * prevent racing processes from causing the surplus to exceed
+ * overcommit
+ *
+ * This however introduces a different race, where a process B
+ * tries to grow the static hugepage pool while alloc_pages() is
+ * called by process A. B will only examine the per-node
+ * counters in determining if surplus huge pages can be
+ * converted to normal huge pages in adjust_pool_surplus(). A
+ * won't be able to increment the per-node counter, until the
+ * lock is dropped by B, but B doesn't drop hugetlb_lock until
+ * no more huge pages can be converted from surplus to normal
+ * state (and doesn't try to convert again). Thus, we have a
+ * case where a surplus huge page exists, the pool is grown, and
+ * the surplus huge page still exists after, even though it
+ * should just have been converted to a normal huge page. This
+ * does not leak memory, though, as the hugepage will be freed
+ * once it is out of use. It also does not allow the counters to
+ * go out of whack in adjust_pool_surplus() as we don't modify
+ * the node values until we've gotten the hugepage and only the
+ * per-node value is checked there.
+ */
spin_lock(&hugetlb_lock);
- if (vma->vm_flags & VM_MAYSHARE)
- resv_huge_pages--;
- else if (free_huge_pages <= resv_huge_pages)
- goto fail;
+ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
+ spin_unlock(&hugetlb_lock);
+ return NULL;
+ } else {
+ h->nr_huge_pages++;
+ h->surplus_huge_pages++;
+ }
+ spin_unlock(&hugetlb_lock);
- page = dequeue_huge_page(vma, addr);
- if (!page)
- goto fail;
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+
+ if (page && arch_prepare_hugepage(page)) {
+ __free_pages(page, huge_page_order(h));
+ page = NULL;
+ }
+ spin_lock(&hugetlb_lock);
+ if (page) {
+ INIT_LIST_HEAD(&page->lru);
+ r_nid = page_to_nid(page);
+ set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
+ /*
+ * We incremented the global counters already
+ */
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
+ __count_vm_event(HTLB_BUDDY_PGALLOC);
+ } else {
+ h->nr_huge_pages--;
+ h->surplus_huge_pages--;
+ __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ }
spin_unlock(&hugetlb_lock);
- set_page_refcounted(page);
+
return page;
+}
+
+/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares about the
+ * physical address of the error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page = NULL;
-fail:
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages - h->resv_huge_pages > 0)
+ page = dequeue_huge_page_node(h, nid);
spin_unlock(&hugetlb_lock);
- return NULL;
+
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ return page;
}
-static int __init hugetlb_init(void)
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(struct hstate *h, int delta)
{
- unsigned long i;
-
- if (HPAGE_SHIFT == 0)
+ struct list_head surplus_list;
+ struct page *page, *tmp;
+ int ret, i;
+ int needed, allocated;
+ bool alloc_ok = true;
+
+ needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
+ if (needed <= 0) {
+ h->resv_huge_pages += delta;
return 0;
+ }
- for (i = 0; i < MAX_NUMNODES; ++i)
- INIT_LIST_HEAD(&hugepage_freelists[i]);
+ allocated = 0;
+ INIT_LIST_HEAD(&surplus_list);
- for (i = 0; i < max_huge_pages; ++i) {
- if (!alloc_fresh_huge_page())
+ ret = -ENOMEM;
+retry:
+ spin_unlock(&hugetlb_lock);
+ for (i = 0; i < needed; i++) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page) {
+ alloc_ok = false;
break;
+ }
+ list_add(&page->lru, &surplus_list);
}
- max_huge_pages = free_huge_pages = nr_huge_pages = i;
- printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
- return 0;
+ allocated += i;
+
+ /*
+ * After retaking hugetlb_lock, we need to recalculate 'needed'
+ * because either resv_huge_pages or free_huge_pages may have changed.
+ */
+ spin_lock(&hugetlb_lock);
+ needed = (h->resv_huge_pages + delta) -
+ (h->free_huge_pages + allocated);
+ if (needed > 0) {
+ if (alloc_ok)
+ goto retry;
+ /*
+ * We were not able to allocate enough pages to
+ * satisfy the entire reservation so we free what
+ * we've allocated so far.
+ */
+ goto free;
+ }
+ /*
+ * The surplus_list now contains _at_least_ the number of extra pages
+ * needed to accommodate the reservation. Add the appropriate number
+ * of pages to the hugetlb pool and free the extras back to the buddy
+ * allocator. Commit the entire reservation here to prevent another
+ * process from stealing the pages as they are added to the pool but
+ * before they are reserved.
+ */
+ needed += allocated;
+ h->resv_huge_pages += delta;
+ ret = 0;
+
+ /* Free the needed pages to the hugetlb pool */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+ if ((--needed) < 0)
+ break;
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ enqueue_huge_page(h, page);
+ }
+free:
+ spin_unlock(&hugetlb_lock);
+
+ /* Free unnecessary surplus pages to the buddy allocator */
+ list_for_each_entry_safe(page, tmp, &surplus_list, lru)
+ put_page(page);
+ spin_lock(&hugetlb_lock);
+
+ return ret;
}
-module_init(hugetlb_init);
-static int __init hugetlb_setup(char *s)
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ * Called with hugetlb_lock held.
+ */
+static void return_unused_surplus_pages(struct hstate *h,
+ unsigned long unused_resv_pages)
+{
+ unsigned long nr_pages;
+
+ /* Uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
+
+ /* Cannot return gigantic pages currently */
+ if (hstate_is_gigantic(h))
+ return;
+
+ nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
+
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes with memory. Iterate across these nodes
+ * until we can no longer free unreserved surplus pages. This occurs
+ * when the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the freed pages across the
+ * on-line nodes with memory and will handle the hstate accounting.
+ */
+ while (nr_pages--) {
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+ break;
+ cond_resched_lock(&hugetlb_lock);
+ }
+}
+
+/*
+ * Determine if the huge page at addr within the vma has an associated
+ * reservation. Where it does not we will need to logically increase
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
+ */
+static long vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct resv_map *resv;
+ pgoff_t idx;
+ long chg;
+
+ resv = vma_resv_map(vma);
+ if (!resv)
+ return 1;
+
+ idx = vma_hugecache_offset(h, vma, addr);
+ chg = region_chg(resv, idx, idx + 1);
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ return chg;
+ else
+ return chg < 0 ? chg : 0;
+}
+static void vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
{
- if (sscanf(s, "%lu", &max_huge_pages) <= 0)
- max_huge_pages = 0;
+ struct resv_map *resv;
+ pgoff_t idx;
+
+ resv = vma_resv_map(vma);
+ if (!resv)
+ return;
+
+ idx = vma_hugecache_offset(h, vma, addr);
+ region_add(resv, idx, idx + 1);
+}
+
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+{
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ struct hstate *h = hstate_vma(vma);
+ struct page *page;
+ long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+
+ idx = hstate_index(h);
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves and will not have been accounted against the subpool
+ * limit. Check that the subpool limit can be met before
+ * satisfying the allocation. MAP_NORESERVE mappings may also
+ * need pages and subpool limit allocated if no reserve
+ * mapping overlaps.
+ */
+ chg = vma_needs_reservation(h, vma, addr);
+ if (chg < 0)
+ return ERR_PTR(-ENOMEM);
+ if (chg || avoid_reserve)
+ if (hugepage_subpool_get_pages(spool, 1))
+ return ERR_PTR(-ENOSPC);
+
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_subpool_put;
+
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ if (!page) {
+ spin_unlock(&hugetlb_lock);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
+ goto out_uncharge_cgroup;
+
+ spin_lock(&hugetlb_lock);
+ list_move(&page->lru, &h->hugepage_activelist);
+ /* Fall through */
+ }
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ spin_unlock(&hugetlb_lock);
+
+ set_page_private(page, (unsigned long)spool);
+
+ vma_commit_reservation(h, vma, addr);
+ return page;
+
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_subpool_put:
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ return ERR_PTR(-ENOSPC);
+}
+
+/*
+ * alloc_huge_page()'s wrapper which simply returns the page if allocation
+ * succeeds, otherwise NULL. This function is called from new_vma_page(),
+ * where no ERR_VALUE is expected to be returned.
+ */
+struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+{
+ struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
+ if (IS_ERR(page))
+ page = NULL;
+ return page;
+}
+
+int __weak alloc_bootmem_huge_page(struct hstate *h)
+{
+ struct huge_bootmem_page *m;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
+ void *addr;
+
+ addr = memblock_virt_alloc_try_nid_nopanic(
+ huge_page_size(h), huge_page_size(h),
+ 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+ if (addr) {
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ m = addr;
+ goto found;
+ }
+ }
+ return 0;
+
+found:
+ BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
+ /* Put them into a private list first because mem_map is not up yet */
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = h;
return 1;
}
-__setup("hugepages=", hugetlb_setup);
-#ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void __init prep_compound_huge_page(struct page *page, int order)
{
- int i;
- nr_huge_pages--;
- nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
- }
- page[1].lru.next = NULL;
- set_page_refcounted(page);
- __free_pages(page, HUGETLB_PAGE_ORDER);
+ if (unlikely(order > (MAX_ORDER - 1)))
+ prep_compound_gigantic_page(page, order);
+ else
+ prep_compound_page(page, order);
}
+/* Put bootmem huge pages into the standard lists after mem_map is up */
+static void __init gather_bootmem_prealloc(void)
+{
+ struct huge_bootmem_page *m;
+
+ list_for_each_entry(m, &huge_boot_pages, list) {
+ struct hstate *h = m->hstate;
+ struct page *page;
+
#ifdef CONFIG_HIGHMEM
-static void try_to_free_low(unsigned long count)
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ memblock_free_late(__pa(m),
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
+ WARN_ON(page_count(page) != 1);
+ prep_compound_huge_page(page, h->order);
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+ * fix confusing memory reports from free(1) and other
+ * side-effects, like CommitLimit going negative.
+ */
+ if (hstate_is_gigantic(h))
+ adjust_managed_page_count(page, 1 << h->order);
+ }
+}
+
+static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
+{
+ unsigned long i;
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h))
+ break;
+ } else if (!alloc_fresh_huge_page(h,
+ &node_states[N_MEMORY]))
+ break;
+ }
+ h->max_huge_pages = i;
+}
+
+static void __init hugetlb_init_hstates(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /* oversize hugepages were init'ed in early boot */
+ if (!hstate_is_gigantic(h))
+ hugetlb_hstate_alloc_pages(h);
+ }
+}
+
+static char * __init memfmt(char *buf, unsigned long n)
+{
+ if (n >= (1UL << 30))
+ sprintf(buf, "%lu GB", n >> 30);
+ else if (n >= (1UL << 20))
+ sprintf(buf, "%lu MB", n >> 20);
+ else
+ sprintf(buf, "%lu KB", n >> 10);
+ return buf;
+}
+
+static void __init report_hugepages(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ char buf[32];
+ pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
+ memfmt(buf, huge_page_size(h)),
+ h->free_huge_pages);
+ }
+}
+
+#ifdef CONFIG_HIGHMEM
+/*
+ * Walk the free lists of the nodes in @nodes_allowed and hand every free
+ * huge page that is not in highmem to update_and_free_page(), adjusting
+ * the free counters.  The walk returns as soon as
+ * @count >= h->nr_huge_pages.  Gigantic hstates are left untouched.
+ * Without CONFIG_HIGHMEM this is an empty stub.
+ */
+static void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
{
int i;
- for (i = 0; i < MAX_NUMNODES; ++i) {
+ if (hstate_is_gigantic(h))
+ return;
+
+ for_each_node_mask(i, *nodes_allowed) {
struct page *page, *next;
- list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+ struct list_head *freel = &h->hugepage_freelists[i];
+ list_for_each_entry_safe(page, next, freel, lru) {
+ if (count >= h->nr_huge_pages)
+ return;
if (PageHighMem(page))
continue;
list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[page_to_nid(page)]--;
- if (count >= nr_huge_pages)
- return;
+ update_and_free_page(h, page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[page_to_nid(page)]--;
}
}
}
#else
-static inline void try_to_free_low(unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
{
}
#endif
-static unsigned long set_max_huge_pages(unsigned long count)
+/*
+ * Increment or decrement surplus_huge_pages. Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * @delta must be -1 or +1 (checked with VM_BUG_ON).
+ * Returns 1 if an adjustment was made, 0 if no node in @nodes_allowed
+ * qualified.
+ */
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+ int delta)
{
- while (count > nr_huge_pages) {
- if (!alloc_fresh_huge_page())
- return nr_huge_pages;
+ int nr_nodes, node;
+
+ VM_BUG_ON(delta != -1 && delta != 1);
+
+ if (delta < 0) {
+ /* Decrement: look for a node that currently has surplus pages. */
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node])
+ goto found;
+ }
+ } else {
+ /* Increment: look for a node whose surplus count is below its total. */
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (h->surplus_huge_pages_node[node] <
+ h->nr_huge_pages_node[node])
+ goto found;
+ }
}
- if (count >= nr_huge_pages)
- return nr_huge_pages;
+ return 0;
+
+found:
+ /* Keep the global and the per-node surplus counters in step. */
+ h->surplus_huge_pages += delta;
+ h->surplus_huge_pages_node[node] += delta;
+ return 1;
+}
+
+/* Pages in the pool that are not accounted as surplus. */
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
+/*
+ * Move the number of persistent huge pages of @h towards @count, only
+ * allocating from / freeing to the nodes in @nodes_allowed.
+ *
+ * Returns the number of persistent huge pages after the attempt, which
+ * may differ from @count when allocation fails, a signal is pending, or
+ * pages cannot be freed.  For a gigantic hstate without
+ * gigantic_page_supported() nothing is changed and h->max_huge_pages is
+ * returned.
+ *
+ * Takes hugetlb_lock; the lock is dropped around each fresh-page
+ * allocation and may be dropped by cond_resched_lock() while freeing.
+ */
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+ nodemask_t *nodes_allowed)
+{
+ unsigned long min_count, ret;
+
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ return h->max_huge_pages;
+ /*
+ * Increase the pool size
+ * First take pages out of surplus state. Then make up the
+ * remaining difference by allocating fresh huge pages.
+ *
+ * We might race with alloc_buddy_huge_page() here and be unable
+ * to convert a surplus huge page to a normal huge page. That is
+ * not critical, though, it just means the overall size of the
+ * pool might be one hugepage larger than it needs to be, but
+ * within all the constraints specified by the sysctls.
+ */
spin_lock(&hugetlb_lock);
- count = max(count, resv_huge_pages);
- try_to_free_low(count);
- while (count < nr_huge_pages) {
- struct page *page = dequeue_huge_page(NULL, 0);
- if (!page)
+ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
- update_and_free_page(page);
}
+
+ while (count > persistent_huge_pages(h)) {
+ /*
+ * If this allocation races such that we no longer need the
+ * page, free_huge_page will handle it by freeing the page
+ * and reducing the surplus.
+ */
+ spin_unlock(&hugetlb_lock);
+ if (hstate_is_gigantic(h))
+ ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+ else
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
+ spin_lock(&hugetlb_lock);
+ if (!ret)
+ goto out;
+
+ /* Bail for signals. Probably ctrl-c from user */
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /*
+ * Decrease the pool size
+ * First return free pages to the buddy allocator (being careful
+ * to keep enough around to satisfy reservations). Then place
+ * pages into surplus state as needed so the pool will shrink
+ * to the desired size as pages become free.
+ *
+ * By placing pages into the surplus state independent of the
+ * overcommit value, we are allowing the surplus pool size to
+ * exceed overcommit. There are few sane options here. Since
+ * alloc_buddy_huge_page() is checking the global counter,
+ * though, we'll note that we're not allowed to exceed surplus
+ * and won't grow the pool anywhere else. Not until one of the
+ * sysctls are changed, or the surplus pages go out of use.
+ */
+ /* Lower bound: reserved pages plus pages that are not free. */
+ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ min_count = max(count, min_count);
+ try_to_free_low(h, min_count, nodes_allowed);
+ while (min_count < persistent_huge_pages(h)) {
+ if (!free_pool_huge_page(h, nodes_allowed, 0))
+ break;
+ cond_resched_lock(&hugetlb_lock);
+ }
+ while (count < persistent_huge_pages(h)) {
+ if (!adjust_pool_surplus(h, nodes_allowed, 1))
+ break;
+ }
+out:
+ ret = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return nr_huge_pages;
+ return ret;
}
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
- size_t *length, loff_t *ppos)
+/* Define a read-only kobj_attribute named <_name>_attr via __ATTR_RO(). */
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+/*
+ * Define a mode-0644 kobj_attribute named <_name>_attr that is wired to
+ * <_name>_show and <_name>_store.
+ */
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+/* kobject created by hugetlb_sysfs_init() as "hugepages" under mm_kobj. */
+static struct kobject *hugepages_kobj;
+/* One kobject per hstate, indexed by hstate_index(). */
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+/* Forward declaration: defined separately for NUMA and !NUMA builds. */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
+/*
+ * Map a sysfs kobject back to its hstate.  If @kobj is one of the global
+ * hstate_kobjs[], *@nidp (when non-NULL) is set to NUMA_NO_NODE;
+ * otherwise the lookup is delegated to kobj_to_node_hstate().
+ */
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
- max_huge_pages = set_max_huge_pages(max_huge_pages);
+ int i;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = NUMA_NO_NODE;
+ return &hstates[i];
+ }
+
+ return kobj_to_node_hstate(kobj, nidp);
+}
+
+/*
+ * Shared "show" implementation for the nr_hugepages attributes: prints the
+ * hstate-wide page count for a global kobject, or the per-node count when
+ * @kobj belongs to a node device.
+ */
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int nid;
+	struct hstate *h = kobj_to_hstate(kobj, &nid);
+	unsigned long nr;
+
+	nr = (nid == NUMA_NO_NODE) ? h->nr_huge_pages
+				   : h->nr_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", nr);
+}
+
+/*
+ * Shared "store" implementation for the nr_hugepages attributes.
+ *
+ * Parses a decimal page count from @buf, picks the set of nodes that may
+ * be used, and asks set_max_huge_pages() to resize the pool; the value it
+ * returns is recorded in h->max_huge_pages.
+ *
+ * Returns @len on success, the kstrtoul() error on a malformed count, or
+ * -EINVAL for a gigantic hstate when gigantic_page_supported() is false.
+ */
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int err;
+ int nid;
+ unsigned long count;
+ struct hstate *h;
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+
+ err = kstrtoul(buf, 10, &count);
+ if (err)
+ goto out;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (nid == NUMA_NO_NODE) {
+ /*
+ * global hstate attribute
+ */
+ /* No usable mempolicy mask: fall back to all N_MEMORY nodes. */
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ } else if (nodes_allowed) {
+ /*
+ * per node hstate attribute: adjust count to global,
+ * but restrict alloc/free to the specified node.
+ */
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ init_nodemask_of_node(nodes_allowed, nid);
+ } else
+ /*
+ * nodes_allowed is NULL here (presumably the NODEMASK_ALLOC
+ * failed - TODO confirm); note count is not adjusted on this
+ * path.
+ */
+ nodes_allowed = &node_states[N_MEMORY];
+
+ h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+
+ /* Only free the mask if it is still the one allocated above. */
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+
+ return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
+}
+
+/* sysfs "show" for nr_hugepages: forwards to nr_hugepages_show_common(). */
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+/*
+ * sysfs "store" for nr_hugepages: resize the pool without consulting the
+ * caller's mempolicy (obey_mempolicy == false).
+ */
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(false, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages);
+
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ * Reading it reports the same value as nr_hugepages.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+/*
+ * sysfs "store" for nr_hugepages_mempolicy: like nr_hugepages_store() but
+ * passes obey_mempolicy == true to the common helper.
+ */
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+
+/* sysfs "show": print the hstate's nr_overcommit_huge_pages value. */
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+
+/*
+ * sysfs "store": set the hstate's nr_overcommit_huge_pages from a decimal
+ * string.  Returns @count on success, -EINVAL for gigantic hstates, or
+ * the kstrtoul() error for malformed input.
+ */
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+
+ if (hstate_is_gigantic(h))
+ return -EINVAL;
+
+ err = kstrtoul(buf, 10, &input);
+ if (err)
+ return err;
+
+ /* The new limit is published under hugetlb_lock. */
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = input;
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+/*
+ * sysfs "show": print the number of free huge pages, hstate-wide for a
+ * global kobject or for one node when @kobj belongs to a node device.
+ */
+static ssize_t free_hugepages_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int nid;
+	struct hstate *h = kobj_to_hstate(kobj, &nid);
+	unsigned long nr_free;
+
+	nr_free = (nid == NUMA_NO_NODE) ? h->free_huge_pages
+					: h->free_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", nr_free);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+/* sysfs "show": print the hstate's resv_huge_pages counter. */
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sprintf(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+/*
+ * sysfs "show": print the number of surplus huge pages, hstate-wide for a
+ * global kobject or for one node when @kobj belongs to a node device.
+ */
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	int nid;
+	struct hstate *h = kobj_to_hstate(kobj, &nid);
+	unsigned long nr_surplus;
+
+	nr_surplus = (nid == NUMA_NO_NODE) ? h->surplus_huge_pages
+					   : h->surplus_huge_pages_node[nid];
+
+	return sprintf(buf, "%lu\n", nr_surplus);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+/*
+ * Attributes attached to each global hstate kobject; the mempolicy
+ * variant is only present on CONFIG_NUMA builds.  NULL-terminated.
+ */
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+ &nr_hugepages_mempolicy_attr.attr,
+#endif
+ NULL,
+};
+
+/* Group passed to hugetlb_sysfs_add_hstate() by hugetlb_sysfs_init(). */
+static struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+/*
+ * Create the sysfs directory named h->name under @parent and populate it
+ * with @hstate_attr_group.  The new kobject is stored in @hstate_kobjs at
+ * the hstate's index.
+ *
+ * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
+ * error returned by sysfs_create_group().  On any failure the slot in
+ * @hstate_kobjs is left NULL.
+ */
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+				    struct kobject **hstate_kobjs,
+				    struct attribute_group *hstate_attr_group)
+{
+	int retval;
+	int hi = hstate_index(h);
+
+	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+	if (!hstate_kobjs[hi])
+		return -ENOMEM;
+
+	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
+	if (retval) {
+		kobject_put(hstate_kobjs[hi]);
+		/*
+		 * Forget the pointer we just dropped: the unregister and
+		 * exit paths call kobject_put() on whatever is stored here,
+		 * which would otherwise put the same kobject twice.
+		 */
+		hstate_kobjs[hi] = NULL;
+	}
+
+	return retval;
+}
+
+/*
+ * Create the "hugepages" kobject under mm_kobj and add one sysfs
+ * directory per hstate beneath it.  Failure to create the parent kobject
+ * silently skips all sysfs setup; failure for an individual hstate is
+ * logged and the remaining hstates are still processed.
+ */
+static void __init hugetlb_sysfs_init(void)
+{
+	struct hstate *h;
+	int err;
+
+	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+	if (!hugepages_kobj)
+		return;
+
+	for_each_hstate(h) {
+		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+					       hstate_kobjs, &hstate_attr_group);
+		/* Terminate the message with a newline like the per-node one. */
+		if (err)
+			pr_err("Hugetlb: Unable to add hstate %s\n", h->name);
+	}
+}
+
+#ifdef CONFIG_NUMA
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+ /* per-node "hugepages" directory; NULL when nothing is registered */
+ struct kobject *hugepages_kobj;
+ /* one kobject per hstate under that directory, by hstate index */
+ struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node devices
+ * (nr_hugepages, free_hugepages and surplus_hugepages).  NULL-terminated.
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+/* Group passed to hugetlb_sysfs_add_hstate() by hugetlb_register_node(). */
+static struct attribute_group per_node_hstate_attr_group = {
+ .attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - find the global hstate behind a per-node hstate
+ * attribute kobject.
+ * Scans node_hstates[] for @kobj; on a match the owning node id is stored
+ * through @nidp (if non-NULL) and the matching entry of hstates[] is
+ * returned.  Not finding @kobj at all is treated as a bug.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+	int nid, idx;
+
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		struct node_hstate *nhs = &node_hstates[nid];
+
+		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+			if (nhs->hstate_kobjs[idx] != kobj)
+				continue;
+
+			if (nidp)
+				*nidp = nid;
+			return &hstates[idx];
+		}
+	}
+
+	BUG();
+	return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node device.
+ * No-op if no hstate attributes attached.
+ * Every kobject that is put is also cleared from node_hstates[] so the
+ * node can be registered again later.
+ */
+static void hugetlb_unregister_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+
+ if (!nhs->hugepages_kobj)
+ return; /* no hstate attributes */
+
+ /* Drop the per-hstate children first, then the parent directory. */
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
+ }
+ }
+
+ kobject_put(nhs->hugepages_kobj);
+ nhs->hugepages_kobj = NULL;
+}
+
+/*
+ * hugetlb module exit: unregister hstate attributes from node devices
+ * that have them.
+ */
+static void hugetlb_unregister_all_nodes(void)
+{
+ int nid;
+
+ /*
+ * disable node device registrations.
+ */
+ register_hugetlbfs_with_node(NULL, NULL);
+
+ /*
+ * remove hstate attributes from any nodes that have them.
+ */
+ /*
+ * NOTE(review): node_devices[nid] is passed on unchecked and
+ * hugetlb_unregister_node() dereferences it - confirm every id below
+ * nr_node_ids has a registered node device.
+ */
+ for (nid = 0; nid < nr_node_ids; nid++)
+ hugetlb_unregister_node(node_devices[nid]);
+}
+
+/*
+ * Register hstate attributes for a single node device.
+ * No-op if attributes already registered.
+ * If adding any hstate fails, everything registered for the node so far
+ * is torn down again via hugetlb_unregister_node().
+ */
+static void hugetlb_register_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+ int err;
+
+ if (nhs->hugepages_kobj)
+ return; /* already allocated */
+
+ nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+ &node->dev.kobj);
+ if (!nhs->hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+ nhs->hstate_kobjs,
+ &per_node_hstate_attr_group);
+ if (err) {
+ pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ h->name, node->dev.id);
+ hugetlb_unregister_node(node);
+ break;
+ }
+ }
+}
+
+/*
+ * hugetlb init time: register hstate attributes for all registered node
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
+ */
+static void hugetlb_register_all_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
+ /* Only register when the device's id matches the node id. */
+ if (node->dev.id == nid)
+ hugetlb_register_node(node);
+ }
+
+ /*
+ * Let the node device driver know we're here so it can
+ * [un]register hstate attributes on node hotplug.
+ */
+ register_hugetlbfs_with_node(hugetlb_register_node,
+ hugetlb_unregister_node);
+}
+#else /* !CONFIG_NUMA */
+
+/*
+ * !CONFIG_NUMA stub: there are no per-node hstate kobjects, so reaching
+ * this lookup is treated as a bug.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ BUG();
+ if (nidp)
+ *nidp = -1;
+ return NULL;
+}
+
+/* !CONFIG_NUMA stubs: no per-node attributes to register or unregister. */
+static void hugetlb_unregister_all_nodes(void) { }
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
+/*
+ * Module exit: drop the per-node attributes, the per-hstate kobjects and
+ * the top-level "hugepages" kobject, then free the fault mutex table.
+ */
+static void __exit hugetlb_exit(void)
+{
+ struct hstate *h;
+
+ hugetlb_unregister_all_nodes();
+
+ for_each_hstate(h) {
+ kobject_put(hstate_kobjs[hstate_index(h)]);
+ }
+
+ kobject_put(hugepages_kobj);
+ kfree(htlb_fault_mutex_table);
+}
+module_exit(hugetlb_exit);
+
+/*
+ * Module init: make sure a default hstate exists, populate the pools,
+ * report them, set up the sysfs, per-node and cgroup interfaces, and
+ * allocate the table of fault mutexes.  Returns 0; does nothing when
+ * hugepages_supported() is false.
+ */
+static int __init hugetlb_init(void)
+{
+ int i;
+
+ if (!hugepages_supported())
+ return 0;
+
+ /*
+ * If no hstate matches the requested default size, fall back to
+ * HPAGE_SIZE and register that hstate if it is missing too.
+ */
+ if (!size_to_hstate(default_hstate_size)) {
+ default_hstate_size = HPAGE_SIZE;
+ if (!size_to_hstate(default_hstate_size))
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ }
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
+ if (default_hstate_max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+
+ hugetlb_init_hstates();
+ gather_bootmem_prealloc();
+ report_hugepages();
+
+ hugetlb_sysfs_init();
+ hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
+
+ /* Size the mutex table: 8 per possible CPU rounded up to a power of two on SMP. */
+#ifdef CONFIG_SMP
+ num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
+#else
+ num_fault_mutexes = 1;
+#endif
+ htlb_fault_mutex_table =
+ kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
+ BUG_ON(!htlb_fault_mutex_table);
+
+ for (i = 0; i < num_fault_mutexes; i++)
+ mutex_init(&htlb_fault_mutex_table[i]);
return 0;
}
+module_init(hugetlb_init);
+
+/*
+ * Should be called on processing a hugepagesz=... option
+ *
+ * Registers a new hstate for pages of PAGE_SIZE << @order in the next
+ * free slot of hstates[] and remembers it in parsed_hstate.  A size that
+ * is already registered is ignored with a warning.  @order == 0 or
+ * running out of hstate slots is a BUG.
+ */
+void __init hugetlb_add_hstate(unsigned order)
+{
+ struct hstate *h;
+ unsigned long i;
+
+ if (size_to_hstate(PAGE_SIZE << order)) {
+ pr_warning("hugepagesz= specified twice, ignoring\n");
+ return;
+ }
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(order == 0);
+ h = &hstates[hugetlb_max_hstate++];
+ h->order = order;
+ /* Mask that clears the offset bits within one huge page. */
+ h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->nr_huge_pages = 0;
+ h->free_huge_pages = 0;
+ for (i = 0; i < MAX_NUMNODES; ++i)
+ INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
+ h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+ /* This name is what hugetlb_sysfs_add_hstate() uses for the sysfs directory. */
+ snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
+ huge_page_size(h)/1024);
+
+ parsed_hstate = h;
+}
+
+/*
+ * Handler for the "hugepages=" boot parameter.  The count is stored for
+ * the most recently parsed hstate, or for the default hstate when no
+ * hugepagesz= has been seen yet.  A second hugepages= for the same target
+ * is ignored with a warning.  Always returns 1.
+ */
+static int __init hugetlb_nrpages_setup(char *s)
+{
+ unsigned long *mhp;
+ /* Target of the previous call, used to detect a repeated hugepages=. */
+ static unsigned long *last_mhp;
+
+ /*
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * so this hugepages= parameter goes to the "default hstate".
+ */
+ if (!hugetlb_max_hstate)
+ mhp = &default_hstate_max_huge_pages;
+ else
+ mhp = &parsed_hstate->max_huge_pages;
+
+ if (mhp == last_mhp) {
+ pr_warning("hugepages= specified twice without "
+ "interleaving hugepagesz=, ignoring\n");
+ return 1;
+ }
+
+ /* An unparsable value is treated as a request for zero pages. */
+ if (sscanf(s, "%lu", mhp) <= 0)
+ *mhp = 0;
+
+ /*
+ * Global state is always initialized later in hugetlb_init.
+ * But we need to allocate >= MAX_ORDER hstates here early to still
+ * use the bootmem allocator.
+ */
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+ hugetlb_hstate_alloc_pages(parsed_hstate);
+
+ last_mhp = mhp;
+
+ return 1;
+}
+__setup("hugepages=", hugetlb_nrpages_setup);
+
+/*
+ * Handler for the "default_hugepagesz=" boot parameter: record the size
+ * parsed by memparse() in default_hstate_size.  Always returns 1.
+ */
+static int __init hugetlb_default_setup(char *s)
+{
+ default_hstate_size = memparse(s, &s);
+ return 1;
+}
+__setup("default_hugepagesz=", hugetlb_default_setup);
+
+/*
+ * Sum the entries of the per-node counter @array over the nodes in
+ * cpuset_current_mems_allowed.
+ */
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+	unsigned int total = 0;
+	int nid;
+
+	for_each_node_mask(nid, cpuset_current_mems_allowed)
+		total += array[nid];
+
+	return total;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Common body of the nr_hugepages sysctl handlers, always acting on the
+ * default hstate.  Reads report h->max_huge_pages; writes resize the pool
+ * through set_max_huge_pages(), honouring the caller's mempolicy only
+ * when @obey_mempolicy is set.
+ *
+ * Returns the proc_doulongvec_minmax() result, -ENOTSUPP when huge pages
+ * are unsupported, or -EINVAL on a write to a gigantic hstate without
+ * gigantic_page_supported().
+ */
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
+ tmp = h->max_huge_pages;
+
+ if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
+ return -EINVAL;
+
+ /* Point the table at the local copy so the proc helper reads/writes tmp. */
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
+
+ if (write) {
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+ GFP_KERNEL | __GFP_NORETRY);
+ /* No usable mempolicy mask: fall back to all N_MEMORY nodes. */
+ if (!(obey_mempolicy &&
+ init_nodemask_of_mempolicy(nodes_allowed))) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+ }
+out:
+ return ret;
+}
+
+/* nr_hugepages sysctl handler that does not consult the task mempolicy. */
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+
+ return hugetlb_sysctl_handler_common(false, table, write,
+ buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+/* nr_hugepages sysctl handler variant that passes obey_mempolicy == true. */
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(true, table, write,
+ buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
+/*
+ * sysctl handler for the default hstate's nr_overcommit_huge_pages.
+ * Returns the proc_doulongvec_minmax() result, -ENOTSUPP when huge pages
+ * are unsupported, or -EINVAL on a write to a gigantic hstate.
+ */
+int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && hstate_is_gigantic(h))
+ return -EINVAL;
+
+ /* Point the table at the local copy so the proc helper reads/writes tmp. */
+ table->data = &tmp;
+ table->maxlen = sizeof(unsigned long);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
+
+ if (write) {
+ spin_lock(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = tmp;
+ spin_unlock(&hugetlb_lock);
+ }
+out:
+ return ret;
+}
+
#endif /* CONFIG_SYSCTL */
-int hugetlb_report_meminfo(char *buf)
+/*
+ * Emit the HugePages_* counters and the page size (in kB) of the default
+ * hstate into @m.  Prints nothing when hugepages_supported() is false.
+ */
+void hugetlb_report_meminfo(struct seq_file *m)
{
- return sprintf(buf,
- "HugePages_Total: %5lu\n"
- "HugePages_Free: %5lu\n"
- "HugePages_Rsvd: %5lu\n"
- "Hugepagesize: %5lu kB\n",
- nr_huge_pages,
- free_huge_pages,
- resv_huge_pages,
- HPAGE_SIZE/1024);
+ struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return;
+ seq_printf(m,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
+ "HugePages_Surp: %5lu\n"
+ "Hugepagesize: %8lu kB\n",
+ h->nr_huge_pages,
+ h->free_huge_pages,
+ h->resv_huge_pages,
+ h->surplus_huge_pages,
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
+/*
+ * Format the default hstate's total/free/surplus counters for node @nid
+ * into @buf and return the sprintf() length; returns 0 without writing
+ * when hugepages_supported() is false.
+ */
int hugetlb_report_node_meminfo(int nid, char *buf)
{
+ struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
- "Node %d HugePages_Free: %5u\n",
- nid, nr_huge_pages_node[nid],
- nid, free_huge_pages_node[nid]);
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
+}
+
+/*
+ * Log, for every N_MEMORY node and every hstate, the per-node total, free
+ * and surplus page counts together with the hstate's page size in kB.
+ * Does nothing when hugepages_supported() is false.
+ */
+void hugetlb_show_meminfo(void)
+{
+ struct hstate *h;
+ int nid;
+
+ if (!hugepages_supported())
+ return;
+
+ for_each_node_state(nid, N_MEMORY)
+ for_each_hstate(h)
+ pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
+/* The total is summed over all hstates. */
unsigned long hugetlb_total_pages(void)
{
- return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+ struct hstate *h;
+ unsigned long nr_total_pages = 0;
+
+ for_each_hstate(h)
+ nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+ return nr_total_pages;
+}
+
+/*
+ * Account a change of @delta huge pages for @h under hugetlb_lock.
+ *
+ * A positive @delta first gathers surplus pages and then checks the
+ * request against the free pages on the nodes of the current cpuset; if
+ * either step fails -ENOMEM is returned.  A negative @delta hands -@delta
+ * pages to return_unused_surplus_pages().  Returns 0 on success.
+ */
+static int hugetlb_acct_memory(struct hstate *h, long delta)
+{
+ int ret = -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. Application can still potentially OOM'ed by kernel
+ * with lack of free htlb page in cpuset that the task is in.
+ * Attempt to enforce strict accounting with cpuset is almost
+ * impossible (or too ugly) because cpuset is too fluid that
+ * task or memory node can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to check against current free page availability as
+ * a best attempt and hopefully to minimize the impact of changing
+ * semantics that cpuset has.
+ */
+ if (delta > 0) {
+ if (gather_surplus_pages(h, delta) < 0)
+ goto out;
+
+ /* Not enough free pages in this cpuset: undo the gather above. */
+ if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ return_unused_surplus_pages(h, delta);
+ goto out;
+ }
+ }
+
+ ret = 0;
+ if (delta < 0)
+ return_unused_surplus_pages(h, (unsigned long) -delta);
+
+out:
+ spin_unlock(&hugetlb_lock);
+ return ret;
+}
+
+/*
+ * vm_ops->open: take an extra reference on the VMA's reservation map when
+ * the VMA has one and is flagged HPAGE_RESV_OWNER.
+ */
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+ struct resv_map *resv = vma_resv_map(vma);
+
+ /*
+ * This new VMA should share its siblings reservation map if present.
+ * The VMA will only ever have a valid reservation map pointer where
+ * it is being copied for another still existing VMA. As that VMA
+ * has a reference to the reservation map it cannot disappear until
+ * after this open call completes. It is therefore safe to take a
+ * new reference here without additional locking.
+ */
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_get(&resv->refs);
+}
+
+/*
+ * vm_ops->close: for a VMA that owns a reservation map, drop its
+ * reference on the map and give back the part of the VMA's range that
+ * region_count() does not account for, both to the global accounting and
+ * to the subpool.
+ */
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct resv_map *resv = vma_resv_map(vma);
+ struct hugepage_subpool *spool = subpool_vma(vma);
+ unsigned long reserve, start, end;
+
+ if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return;
+
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
+
+ /* Computed before the kref_put() below, which may release the map. */
+ reserve = (end - start) - region_count(resv, start, end);
+
+ kref_put(&resv->refs, resv_map_release);
+
+ if (reserve) {
+ /* reserve is unsigned; the negation reaches the callee as a negative long. */
+ hugetlb_acct_memory(h, -reserve);
+ hugepage_subpool_put_pages(spool, reserve);
+ }
}
/*
@@ -279,15 +2479,16 @@ unsigned long hugetlb_total_pages(void)
* hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
* this far.
*/
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
- unsigned long address, int *unused)
+/* vm_ops->fault placeholder: being called at all is treated as a bug. */
+static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
- return NULL;
+ return 0;
}
-struct vm_operations_struct hugetlb_vm_ops = {
- .nopage = hugetlb_nopage,
+/*
+ * VMA operations for hugetlb mappings: open/close manage the reservation
+ * map reference, fault is a BUG() placeholder.
+ */
+const struct vm_operations_struct hugetlb_vm_ops = {
+ .fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
+ .close = hugetlb_vm_op_close,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -296,13 +2497,15 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
pte_t entry;
if (writable) {
- entry =
- pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
+ vma->vm_page_prot)));
} else {
- entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ entry = huge_pte_wrprotect(mk_huge_pte(page,
+ vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, vma, page, writable);
return entry;
}
@@ -312,12 +2515,36 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = pte_mkwrite(pte_mkdirty(*ptep));
- ptep_set_access_flags(vma, address, ptep, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
+ update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Return 1 if @pte is a non-present, non-empty huge PTE that encodes a
+ * migration entry, 0 otherwise.
+ */
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t entry;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+
+	entry = pte_to_swp_entry(pte);
+	return (non_swap_entry(entry) && is_migration_entry(entry)) ? 1 : 0;
}
+/*
+ * Return 1 if @pte is a non-present, non-empty huge PTE that encodes a
+ * hwpoison entry, 0 otherwise.
+ */
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+	swp_entry_t entry;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+
+	entry = pte_to_swp_entry(pte);
+	return (non_swap_entry(entry) && is_hwpoison_entry(entry)) ? 1 : 0;
+}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -326,126 +2553,436 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, addr);
- if (!dst_pte)
- goto nomem;
- spin_lock(&dst->page_table_lock);
- spin_lock(&src->page_table_lock);
- if (!pte_none(*src_pte)) {
+ dst_pte = huge_pte_alloc(dst, addr, sz);
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* If the pagetables are shared don't copy or take references */
+ if (dst_pte == src_pte)
+ continue;
+
+ dst_ptl = huge_pte_lock(h, dst, dst_pte);
+ src_ptl = huge_pte_lockptr(h, src, src_pte);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ if (huge_pte_none(entry)) { /* skip none entry */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
+ }
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else {
if (cow)
- ptep_set_wrprotect(src, addr, src_pte);
- entry = *src_pte;
+ huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
- spin_unlock(&src->page_table_lock);
- spin_unlock(&dst->page_table_lock);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
}
- return 0;
-nomem:
- return -ENOMEM;
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+ return ret;
}
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end)
+/*
+ * Unmap the huge pages of @vma in [@start, @end), batching the freed
+ * pages in @tlb.  @start and @end must be aligned to the hstate's huge
+ * page size (enforced with BUG_ON).
+ *
+ * When @ref_page is non-NULL only that page is unmapped, the VMA is
+ * flagged HPAGE_RESV_UNMAPPED, and the walk stops after it.  When the
+ * mmu_gather batch fills up, the walk is interrupted to flush and then
+ * restarted from @start.
+ */
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
+ spinlock_t *ptl;
struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
+ const unsigned long mmun_start = start; /* For mmu_notifiers */
+ const unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(start & ~HPAGE_MASK);
- BUG_ON(end & ~HPAGE_MASK);
-
- spin_lock(&mm->page_table_lock);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
+ BUG_ON(start & ~huge_page_mask(h));
+ BUG_ON(end & ~huge_page_mask(h));
- for (address = start; address < end; address += HPAGE_SIZE) {
+ tlb_start_vma(tlb, vma);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+again:
+ for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, &address, ptep))
+ goto unlock;
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte))
+ goto unlock;
+
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ huge_pte_clear(mm, address, ptep);
+ goto unlock;
+ }
+
+ page = pte_page(pte);
+ /*
+ * If a reference page is supplied, it is because a specific
+ * page is being unmapped, not a range. Ensure the page we
+ * are about to unmap is the actual page of interest.
+ */
+ if (ref_page) {
+ if (page != ref_page)
+ goto unlock;
+
+ /*
+ * Mark the VMA as having unmapped its page so that
+ * future faults in this VMA will fail rather than
+ * looking like data was lost
+ */
+ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+ }
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
- if (pte_none(pte))
+ tlb_remove_tlb_entry(tlb, ptep, address);
+ if (huge_pte_dirty(pte))
+ set_page_dirty(page);
+
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush) {
+ spin_unlock(ptl);
+ break;
+ }
+ /* Bail out after unmapping reference page if supplied */
+ if (ref_page) {
+ spin_unlock(ptl);
+ break;
+ }
+unlock:
+ spin_unlock(ptl);
+ }
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
+ }
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ tlb_end_vma(tlb, vma);
+}
+
+/*
+ * Variant of __unmap_hugepage_range() that also clears VM_MAYSHARE on
+ * @vma after the unmap.
+ */
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
+}
+
+/*
+ * Convenience wrapper: set up a local mmu_gather for the VMA's mm, run
+ * __unmap_hugepage_range() over [@start, @end) and finish the gather.
+ */
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
+}
+
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ * Always returns 1.
+ */
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, unsigned long address)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct vm_area_struct *iter_vma;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+
+ /*
+ * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+ * from page cache lookup which is in HPAGE_SIZE units.
+ */
+ address = address & huge_page_mask(h);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ mapping = file_inode(vma->vm_file)->i_mapping;
+
+ /*
+ * Take the mapping lock for the duration of the table walk. As
+ * this mapping should be shared between all the VMAs,
+ * __unmap_hugepage_range() is called as the lock is already held
+ */
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
+ /* Do not unmap the current VMA */
+ if (iter_vma == vma)
continue;
- page = pte_page(pte);
- put_page(page);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
+ /*
+ * Unmap the page from other VMAs without their own reserves.
+ * They get marked to be SIGKILLed if they fault in these
+ * areas. This is because a future no-page fault on this VMA
+ * could insert a zeroed page instead of the data existing
+ * from the time of fork. This would look like data corruption
+ */
+ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
+ mutex_unlock(&mapping->i_mmap_mutex);
- spin_unlock(&mm->page_table_lock);
- flush_tlb_range(vma, start, end);
+ return 1;
}
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, pte_t pte)
+ unsigned long address, pte_t *ptep, pte_t pte,
+ struct page *pagecache_page, spinlock_t *ptl)
{
+ struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int avoidcopy;
+ int outside_reserve = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_page = pte_page(pte);
+retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
- if (avoidcopy) {
+ if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
- return VM_FAULT_MINOR;
+ return 0;
}
+ /*
+ * If the process that created a MAP_PRIVATE mapping is about to
+ * perform a COW due to a shared page count, attempt to satisfy
+ * the allocation without using the existing reserves. The pagecache
+ * page is used to determine if the reserve at this address was
+ * consumed or not. If reserves were used, a partial faulted mapping
+ * at the time of fork() could consume its reserves on COW instead
+ * of the full address range.
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+ old_page != pagecache_page)
+ outside_reserve = 1;
+
page_cache_get(old_page);
- new_page = alloc_huge_page(vma, address);
- if (!new_page) {
+ /* Drop page table lock as buddy allocator may be called */
+ spin_unlock(ptl);
+ new_page = alloc_huge_page(vma, address, outside_reserve);
+
+ if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
+
+ /*
+ * If a process owning a MAP_PRIVATE mapping fails to COW,
+ * it is due to references held by a child and an insufficient
+ * huge page pool. To guarantee the original mapper's
+ * reliability, unmap the page from child processes. The child
+ * may get SIGKILLed if it later faults.
+ */
+ if (outside_reserve) {
+ BUG_ON(huge_pte_none(pte));
+ if (unmap_ref_private(mm, vma, old_page, address)) {
+ BUG_ON(huge_pte_none(pte));
+ spin_lock(ptl);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
+ goto retry_avoidcopy;
+ /*
+ * race occurs while re-acquiring page table
+ * lock, and our job is done.
+ */
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ }
+
+ /* Caller expects lock to be held */
+ spin_lock(ptl);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
+ }
+
+ /*
+ * When the original hugepage is a shared one, it does not have
+ * an anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma))) {
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ /* Caller expects lock to be held */
+ spin_lock(ptl);
return VM_FAULT_OOM;
}
- spin_unlock(&mm->page_table_lock);
- copy_huge_page(new_page, old_page, address);
- spin_lock(&mm->page_table_lock);
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
+ __SetPageUptodate(new_page);
+
+ mmun_start = address & huge_page_mask(h);
+ mmun_end = mmun_start + huge_page_size(h);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ /*
+ * Retake the page table lock to check for racing updates
+ * before the page tables are altered
+ */
+ spin_lock(ptl);
+ ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
+ ClearPagePrivate(new_page);
- ptep = huge_pte_offset(mm, address & HPAGE_MASK);
- if (likely(pte_same(*ptep, pte))) {
/* Break COW */
+ huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
}
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
page_cache_release(new_page);
page_cache_release(old_page);
- return VM_FAULT_MINOR;
+
+ /* Caller expects lock to be held */
+ spin_lock(ptl);
+ return 0;
}
-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, int write_access)
+/*
+ * Return the pagecache page at a given address within a VMA.
+ *
+ * The lookup uses the mapping of vma->vm_file and the index computed by
+ * vma_hugecache_offset(); the result of find_lock_page() is returned as is.
+ * NOTE(review): going by that helper's name a found page comes back locked
+ * and referenced - hugetlb_fault() pairs this call with unlock_page() and
+ * put_page() on the returned page.
+ */
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
{
+ struct address_space *mapping;
+ pgoff_t idx;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ return find_lock_page(mapping, idx);
+}
+
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ *
+ * The page is looked up with find_get_page() and, when one is found, it is
+ * handed straight back with put_page(); only the fact that a page exists
+ * is reported to the caller.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
+{
+ struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
- unsigned long idx;
+ int anon_rmap = 0;
unsigned long size;
struct page *page;
- struct address_space *mapping;
pte_t new_pte;
+ spinlock_t *ptl;
- mapping = vma->vm_file->f_mapping;
- idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ /*
+ * Currently, we are forced to kill the process in the event the
+ * original mapper has unmapped pages from the child due to a failed
+ * COW. Warn that such a situation has occurred as it may not be obvious
+ */
+ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+ pr_warning("PID %d killed due to inadequate hugepage pool\n",
+ current->pid);
+ return ret;
+ }
/*
* Use page lock to guard against racing truncation
@@ -454,142 +2991,333 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- if (hugetlb_get_quota(mapping))
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
goto out;
- page = alloc_huge_page(vma, address);
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = VM_FAULT_OOM;
+ page = alloc_huge_page(vma, address, 0);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
- clear_huge_page(page, address);
+ clear_huge_page(page, address, pages_per_huge_page(h));
+ __SetPageUptodate(page);
- if (vma->vm_flags & VM_SHARED) {
+ if (vma->vm_flags & VM_MAYSHARE) {
int err;
+ struct inode *inode = mapping->host;
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err) {
put_page(page);
- hugetlb_put_quota(mapping);
if (err == -EEXIST)
goto retry;
goto out;
}
- } else
+ ClearPagePrivate(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ } else {
lock_page(page);
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ anon_rmap = 1;
+ }
+ } else {
+ /*
+ * If a memory error occurs between mmap() and fault, some processes
+ * don't have a hwpoisoned swap entry for the errored virtual address.
+ * So we need to block the hugepage fault with a PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto backout_unlocked;
+ }
}
- spin_lock(&mm->page_table_lock);
- size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ /*
+ * If we are going to COW a private mapping later, we examine the
+ * pending reservations for this page now. This will ensure that
+ * any allocations necessary to record that reservation occur outside
+ * the spinlock.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+
+ ptl = huge_pte_lockptr(h, mm, ptep);
+ spin_lock(ptl);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
- ret = VM_FAULT_MINOR;
- if (!pte_none(*ptep))
+ ret = 0;
+ if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ if (anon_rmap) {
+ ClearPagePrivate(page);
+ hugepage_add_new_anon_rmap(page, vma, address);
+ } else
+ page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
- if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
unlock_page(page);
out:
return ret;
backout:
- spin_unlock(&mm->page_table_lock);
- hugetlb_put_quota(mapping);
+ spin_unlock(ptl);
+backout_unlocked:
unlock_page(page);
put_page(page);
goto out;
}
+#ifdef CONFIG_SMP
+/*
+ * Pick the slot in the fault mutex table for a fault; hugetlb_fault() uses
+ * the returned value to index htlb_fault_mutex_table.
+ *
+ * VM_SHARED mappings are keyed on (mapping, idx); every other mapping is
+ * keyed on (mm, address >> huge_page_shift(h)). The two-word key is hashed
+ * with jhash2() and masked with num_fault_mutexes - 1.
+ * NOTE(review): the mask presumably relies on num_fault_mutexes being a
+ * power of two - confirm where the table is sized.
+ */
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ unsigned long key[2];
+ u32 hash;
+
+ if (vma->vm_flags & VM_SHARED) {
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
+ } else {
+ key[0] = (unsigned long) mm;
+ key[1] = address >> huge_page_shift(h);
+ }
+
+ hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+ return hash & (num_fault_mutexes - 1);
+}
+#else
+/*
+ * For uniprocessor systems we always use a single mutex, so just
+ * return 0 and avoid the hashing overhead.
+ */
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ return 0;
+}
+#endif
+
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+ unsigned long address, unsigned int flags)
{
- pte_t *ptep;
- pte_t entry;
+ pte_t *ptep, entry;
+ spinlock_t *ptl;
int ret;
- static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+ u32 hash;
+ pgoff_t idx;
+ struct page *page = NULL;
+ struct page *pagecache_page = NULL;
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping;
+
+ address &= huge_page_mask(h);
+
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait_huge(vma, mm, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ }
- ptep = huge_pte_alloc(mm, address);
+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- mutex_lock(&hugetlb_instantiation_mutex);
- entry = *ptep;
- if (pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
- mutex_unlock(&hugetlb_instantiation_mutex);
- return ret;
+ hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&htlb_fault_mutex_table[hash]);
+
+ entry = huge_ptep_get(ptep);
+ if (huge_pte_none(entry)) {
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+ goto out_mutex;
}
- ret = VM_FAULT_MINOR;
+ ret = 0;
- spin_lock(&mm->page_table_lock);
+ /*
+ * If we are going to COW the mapping later, we examine the pending
+ * reservations for this page now. This will ensure that any
+ * allocations necessary to record that reservation occur outside the
+ * spinlock. For private mappings, we also lookup the pagecache
+ * page now as it is used to determine if a reservation has been
+ * consumed.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if (vma_needs_reservation(h, vma, address) < 0) {
+ ret = VM_FAULT_OOM;
+ goto out_mutex;
+ }
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ pagecache_page = hugetlbfs_pagecache_page(h,
+ vma, address);
+ }
+
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need to take the former one
+ * when page != pagecache_page or !pagecache_page.
+ * Note that locking order is always pagecache_page -> page,
+ * so no worry about deadlock.
+ */
+ page = pte_page(entry);
+ get_page(page);
+ if (page != pagecache_page)
+ lock_page(page);
+
+ ptl = huge_pte_lockptr(h, mm, ptep);
+ spin_lock(ptl);
/* Check for a racing update before calling hugetlb_cow */
- if (likely(pte_same(entry, *ptep)))
- if (write_access && !pte_write(entry))
- ret = hugetlb_cow(mm, vma, address, ptep, entry);
- spin_unlock(&mm->page_table_lock);
- mutex_unlock(&hugetlb_instantiation_mutex);
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_ptl;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ if (!huge_pte_write(entry)) {
+ ret = hugetlb_cow(mm, vma, address, ptep, entry,
+ pagecache_page, ptl);
+ goto out_ptl;
+ }
+ entry = huge_pte_mkdirty(entry);
+ }
+ entry = pte_mkyoung(entry);
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry,
+ flags & FAULT_FLAG_WRITE))
+ update_mmu_cache(vma, address, ptep);
+
+out_ptl:
+ spin_unlock(ptl);
+
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+ if (page != pagecache_page)
+ unlock_page(page);
+ put_page(page);
+
+out_mutex:
+ mutex_unlock(&htlb_fault_mutex_table[hash]);
return ret;
}
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
+long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, unsigned long *nr_pages,
+ long i, unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
- int remainder = *length;
+ unsigned long remainder = *nr_pages;
+ struct hstate *h = hstate_vma(vma);
- spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
+ spinlock_t *ptl = NULL;
+ int absent;
struct page *page;
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make * sure we get the
+ * each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
+ *
+ * Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+ if (pte)
+ ptl = huge_pte_lock(h, mm, pte);
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ if (pte)
+ spin_unlock(ptl);
+ remainder = 0;
+ break;
+ }
- if (!pte || pte_none(*pte)) {
+ /*
+ * We need to call hugetlb_fault for both hugepages under migration
+ * (in which case hugetlb_fault waits for the migration,) and
+ * hwpoisoned hugepages (in which case we need to prevent the
+ * caller from accessing them.) In order to do this, we use
+ * here is_swap_pte instead of is_hugetlb_entry_migration and
+ * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+ * both cases, and because we can't follow correct pages
+ * directly from any kind of swap entries.
+ */
+ if (absent || is_swap_pte(huge_ptep_get(pte)) ||
+ ((flags & FOLL_WRITE) &&
+ !huge_pte_write(huge_ptep_get(pte)))) {
int ret;
- spin_unlock(&mm->page_table_lock);
- ret = hugetlb_fault(mm, vma, vaddr, 0);
- spin_lock(&mm->page_table_lock);
- if (ret == VM_FAULT_MINOR)
+ if (pte)
+ spin_unlock(ptl);
+ ret = hugetlb_fault(mm, vma, vaddr,
+ (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
+ if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
- if (!i)
- i = -EFAULT;
break;
}
- pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
- page = pte_page(*pte);
+ pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
+ page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
- get_page(page);
- pages[i] = page + pfn_offset;
+ pages[i] = mem_map_offset(page, pfn_offset);
+ get_page_foll(pages[i]);
}
if (vmas)
@@ -600,199 +3328,462 @@ same_page:
--remainder;
++i;
if (vaddr < vma->vm_end && remainder &&
- pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+ pfn_offset < pages_per_huge_page(h)) {
/*
* We use pfn_offset to avoid touching the pageframes
* of this compound page.
*/
goto same_page;
}
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
- *length = remainder;
+ *nr_pages = remainder;
*position = vaddr;
- return i;
+ return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
- spin_lock(&mm->page_table_lock);
- for (; address < end; address += HPAGE_SIZE) {
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ for (; address < end; address += huge_page_size(h)) {
+ spinlock_t *ptl;
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (!pte_none(*ptep)) {
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
+ spin_unlock(ptl);
+ continue;
+ }
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(pte_modify(pte, newprot));
+ pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte = arch_make_huge_pte(pte, vma, NULL, 0);
set_huge_pte_at(mm, address, ptep, pte);
- lazy_mmu_prot_update(pte);
+ pages++;
}
+ spin_unlock(ptl);
}
- spin_unlock(&mm->page_table_lock);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
-}
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ mmu_notifier_invalidate_range_end(mm, start, end);
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
+ return pages << h->order;
+}
-static long region_add(struct list_head *head, long f, long t)
+int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
- struct file_region *rg, *nrg, *trg;
+ long ret, chg;
+ struct hstate *h = hstate_inode(inode);
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ struct resv_map *resv_map;
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+ /*
+ * Only apply hugepage reservation if asked. At fault time, an
+ * attempt will be made for VM_NORESERVE to allocate a page
+ * without using reserves
+ */
+ if (vm_flags & VM_NORESERVE)
+ return 0;
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
+ /*
+ * Shared mappings base their reservation on the number of pages that
+ * are already allocated on behalf of the file. Private mappings need
+ * to reserve the full area even if read-only as mprotect() may be
+ * called to make the mapping read-write. Assume !vma is a shm mapping
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ resv_map = inode_resv_map(inode);
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
+ chg = region_chg(resv_map, from, to);
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- list_del(&rg->link);
- kfree(rg);
- }
+ } else {
+ resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
+
+ chg = to - from;
+
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- nrg->from = f;
- nrg->to = t;
+
+ if (chg < 0) {
+ ret = chg;
+ goto out_err;
+ }
+
+ /* There must be enough pages in the subpool for the mapping */
+ if (hugepage_subpool_get_pages(spool, chg)) {
+ ret = -ENOSPC;
+ goto out_err;
+ }
+
+ /*
+ * Check enough hugepages are available for the reservation.
+ * Hand the pages back to the subpool if there are not
+ */
+ ret = hugetlb_acct_memory(h, chg);
+ if (ret < 0) {
+ hugepage_subpool_put_pages(spool, chg);
+ goto out_err;
+ }
+
+ /*
+ * Account for the reservations made. Shared mappings record regions
+ * that have reservations as they are shared by multiple VMAs.
+ * When the last VMA disappears, the region map says how much
+ * the reservation was and the page cache tells how much of
+ * the reservation was consumed. Private mappings are per-VMA and
+ * only the consumed reservations are tracked. When the VMA
+ * disappears, the original reservation is the VMA size and the
+ * consumed reservations are stored in the map. Hence, nothing
+ * else has to be done for private mappings here
+ */
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_add(resv_map, from, to);
return 0;
+out_err:
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&resv_map->refs, resv_map_release);
+ return ret;
}
-static long region_chg(struct list_head *head, long f, long t)
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
- struct file_region *rg, *nrg;
+ struct hstate *h = hstate_inode(inode);
+ struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
+ struct hugepage_subpool *spool = subpool_inode(inode);
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+ if (resv_map)
+ chg = region_truncate(resv_map, offset);
+ spin_lock(&inode->i_lock);
+ inode->i_blocks -= (blocks_per_huge_page(h) * freed);
+ spin_unlock(&inode->i_lock);
- /* If we are below the current region then a new region is required.
- * Subtle, allocate a new region at the position but make it zero
- * size such that we can guarentee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (nrg == 0)
- return -ENOMEM;
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- list_add(&nrg->link, rg->link.prev);
+ hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -(chg - freed));
+}
- return t - f;
- }
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t idx)
+{
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & PUD_MASK;
+ unsigned long s_end = sbase + PUD_SIZE;
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
+ /* Allow segments to share if only one is marked locked */
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- return chg;
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (pmd_index(addr) != pmd_index(saddr) ||
+ vm_flags != svm_flags ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
- /* We overlap with this area, if it extends futher than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
+ return saddr;
+}
+
+/*
+ * Return 1 when @vma is marked VM_MAYSHARE and the PUD_SIZE-aligned region
+ * that contains @addr lies entirely inside [vm_start, vm_end); return 0
+ * otherwise. huge_pmd_share() falls back to a plain pmd_alloc() when this
+ * returns 0.
+ */
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long base = addr & PUD_MASK;
+ unsigned long end = base + PUD_SIZE;
+
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (vma->vm_flags & VM_MAYSHARE &&
+ vma->vm_start <= base && end <= vma->vm_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ struct vm_area_struct *vma = find_vma(mm, addr);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct vm_area_struct *svma;
+ unsigned long saddr;
+ pte_t *spte = NULL;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (!vma_shareable(vma, addr))
+ return (pte_t *)pmd_alloc(mm, pud, addr);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+ if (svma == vma)
+ continue;
+
+ saddr = page_table_shareable(svma, vma, addr, idx);
+ if (saddr) {
+ spte = huge_pte_offset(svma->vm_mm, saddr);
+ if (spte) {
+ get_page(virt_to_page(spte));
+ break;
+ }
}
- chg -= rg->to - rg->from;
}
- return chg;
+
+ if (!spte)
+ goto out;
+
+ ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+ spin_lock(ptl);
+ if (pud_none(*pud))
+ pud_populate(mm, pud,
+ (pmd_t *)((unsigned long)spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(ptl);
+out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
}
-static long region_truncate(struct list_head *head, long end)
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with page table lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
- struct file_region *rg, *trg;
- long chg = 0;
+ pgd_t *pgd = pgd_offset(mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
- break;
- if (&rg->link == head)
+ BUG_ON(page_count(virt_to_page(ptep)) == 0);
+ if (page_count(virt_to_page(ptep)) == 1)
return 0;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+ return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ if (want_pmd_share() && pud_none(*pud))
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ }
}
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
+ return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, addr);
+ if (pud_present(*pud)) {
+ if (pud_huge(*pud))
+ return (pte_t *)pud;
+ pmd = pmd_offset(pud, addr);
+ }
}
- return chg;
+ return (pte_t *) pmd;
}
-static int hugetlb_acct_memory(long delta)
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
{
- int ret = -ENOMEM;
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pmd);
+ if (page)
+ page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+/*
+ * Return the struct page that corresponds to @address inside the huge page
+ * referenced by @pud: the page obtained from pte_page() on the entry,
+ * advanced by the offset of @address within the PUD-sized region (counted
+ * in PAGE_SIZE units). The offset is only applied when pte_page() yields a
+ * non-NULL page. @mm and @write are not used by this implementation.
+ */
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ struct page *page;
+
+ page = pte_page(*(pte_t *)pud);
+ if (page)
+ page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/*
+ * Can be overridden by architectures. This __weak default calls BUG();
+ * the return statement after it only satisfies the function's return type.
+ */
+struct page * __weak
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int write)
+{
+ BUG();
+ return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+#ifdef CONFIG_MEMORY_FAILURE
+
+/*
+ * Return 1 if @hpage is found on the free list of its hstate for the node
+ * it belongs to (h->hugepage_freelists[page_to_nid(hpage)]), 0 otherwise.
+ *
+ * Should be called with hugetlb_lock held; the caller in this file,
+ * dequeue_hwpoisoned_huge_page(), takes that lock around the call.
+ */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+ if (page == hpage)
+ return 1;
+ return 0;
+}
+
+/*
+ * This function is called from memory failure code.
+ * Assume the caller holds page lock of the head page.
+ */
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+ int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- if ((delta + resv_huge_pages) <= free_huge_pages) {
- resv_huge_pages += delta;
+ if (is_hugepage_on_freelist(hpage)) {
+ /*
+ * Hwpoisoned hugepage isn't linked to activelist or freelist,
+ * but dangling hpage->lru can trigger list-debug warnings
+ * (this happens when we call unpoison_memory() on it),
+ * so let it point to itself with list_del_init().
+ */
+ list_del_init(&hpage->lru);
+ set_page_refcounted(hpage);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
ret = 0;
}
spin_unlock(&hugetlb_lock);
return ret;
}
+#endif
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+bool isolate_huge_page(struct page *page, struct list_head *list)
{
- long ret, chg;
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ if (!get_page_unless_zero(page))
+ return false;
+ spin_lock(&hugetlb_lock);
+ list_move_tail(&page->lru, list);
+ spin_unlock(&hugetlb_lock);
+ return true;
+}
- chg = region_chg(&inode->i_mapping->private_list, from, to);
- if (chg < 0)
- return chg;
- ret = hugetlb_acct_memory(chg);
- if (ret < 0)
- return ret;
- region_add(&inode->i_mapping->private_list, from, to);
- return 0;
+/*
+ * Move @page to the tail of its hstate's hugepage_activelist while holding
+ * hugetlb_lock, then drop one reference with put_page(). @page must be a
+ * head page (VM_BUG_ON_PAGE checks PageHead).
+ * NOTE(review): this looks like the counterpart of isolate_huge_page(),
+ * which takes a reference and moves the page onto a caller-supplied list -
+ * confirm against the callers.
+ */
+void putback_active_hugepage(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ spin_lock(&hugetlb_lock);
+ list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ put_page(page);
}
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+bool is_hugepage_active(struct page *page)
{
- long chg = region_truncate(&inode->i_mapping->private_list, offset);
- hugetlb_acct_memory(freed - chg);
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
+ /*
+ * This function can be called for a tail page because the caller,
+ * scan_movable_pages, scans through a given pfn-range which typically
+ * covers one memory block. In systems using gigantic hugepage (1GB
+ * for x86_64,) a hugepage is larger than a memory block, and we don't
+ * support migrating such large hugepages for now, so return false
+ * when called for tail pages.
+ */
+ if (PageTail(page))
+ return false;
+ /*
+ * The refcount of a hwpoisoned hugepage is 1, but it is not active,
+ * so we should return false for it.
+ */
+ if (unlikely(PageHWPoison(page)))
+ return false;
+ return page_count(page) > 0;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 00000000000..493f758445e
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,409 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+/*
+ * Per-cgroup state of the hugetlb controller: the embedded css plus one
+ * res_counter per hstate slot.
+ */
+struct hugetlb_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * One counter per huge page size (indexed by hstate index), used to
+ * account for the hugepages charged to this cgroup.
+ */
+ struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+/*
+ * cftype->private packs two 16-bit fields: the hstate index in the upper
+ * half (MEMFILE_IDX) and the res_counter attribute in the lower half
+ * (MEMFILE_ATTR).
+ */
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+/* Map a css back to its enclosing hugetlb_cgroup; NULL maps to NULL. */
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+	if (!s)
+		return NULL;
+
+	return container_of(s, struct hugetlb_cgroup, css);
+}
+
+/* Look up the hugetlb_cgroup that @task currently belongs to. */
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+	struct cgroup_subsys_state *css = task_css(task, hugetlb_cgrp_id);
+
+	return hugetlb_cgroup_from_css(css);
+}
+
+/* True when @h_cg is the root hugetlb cgroup recorded at css_alloc time. */
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+	return root_h_cgroup == h_cg;
+}
+
+/* Return the parent hugetlb_cgroup, or NULL when the css has no parent. */
+static inline struct hugetlb_cgroup *
+parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
+{
+	struct cgroup_subsys_state *parent_css = h_cg->css.parent;
+
+	return hugetlb_cgroup_from_css(parent_css);
+}
+
+/* True if any in-use hstate slot of @h_cg still reports non-zero usage. */
+static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
+{
+	int idx = 0;
+
+	while (idx < hugetlb_max_hstate) {
+		if (res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE) > 0)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Allocate a zeroed hugetlb_cgroup and initialise every per-hstate counter,
+ * chaining each one to the matching counter of the parent when there is a
+ * parent.  A cgroup created without a parent is recorded as the root.
+ * Returns ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cgroup_subsys_state *
+hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
+	struct hugetlb_cgroup *h_cgroup;
+	int idx;
+
+	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+	if (!h_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	if (!parent_h_cgroup)
+		root_h_cgroup = h_cgroup;
+
+	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+		struct res_counter *parent_counter = NULL;
+
+		if (parent_h_cgroup)
+			parent_counter = &parent_h_cgroup->hugepage[idx];
+		res_counter_init(&h_cgroup->hugepage[idx], parent_counter);
+	}
+
+	return &h_cgroup->css;
+}
+
+/* Release the hugetlb_cgroup that embeds @css. */
+static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(hugetlb_cgroup_from_css(css));
+}
+
+
+/*
+ * Re-parent the charge of @page from @h_cg to its parent (or to the root
+ * cgroup when @h_cg has no parent).
+ *
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, so no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
+				       struct page *page)
+{
+	/*
+	 * Byte size of the compound page.  Kept as unsigned long, like
+	 * csize in the other charge/uncharge helpers of this file, so that
+	 * PAGE_SIZE << order is not truncated for very large orders.
+	 */
+	unsigned long csize;
+	struct res_counter *counter;
+	struct res_counter *fail_res;
+	struct hugetlb_cgroup *page_hcg;
+	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+
+	page_hcg = hugetlb_cgroup_from_page(page);
+	/*
+	 * We can have pages in active list without any cgroup
+	 * ie, hugepage with less than 3 pages. We can safely
+	 * ignore those pages.  Pages charged to some other cgroup
+	 * are not ours to move either.
+	 */
+	if (!page_hcg || page_hcg != h_cg)
+		return;
+
+	csize = PAGE_SIZE << compound_order(page);
+	if (!parent) {
+		parent = root_h_cgroup;
+		/* root has no limit */
+		res_counter_charge_nofail(&parent->hugepage[idx],
+					  csize, &fail_res);
+	}
+	counter = &h_cg->hugepage[idx];
+	res_counter_uncharge_until(counter, counter->parent, csize);
+
+	set_hugetlb_cgroup(page, parent);
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ *
+ * Every pass walks the active list of each hstate under hugetlb_lock and
+ * re-parents the pages charged to this cgroup; passes repeat until no
+ * counter reports usage any more.
+ */
+static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+	struct hstate *h;
+	struct page *page;
+	int idx;
+
+	do {
+		/*
+		 * idx must track the hstate being walked, so restart it on
+		 * every pass; otherwise a second pass would index
+		 * hugepage[] beyond the hstates that exist.
+		 */
+		idx = 0;
+		for_each_hstate(h) {
+			spin_lock(&hugetlb_lock);
+			list_for_each_entry(page, &h->hugepage_activelist, lru)
+				hugetlb_cgroup_move_parent(idx, h_cg, page);
+
+			spin_unlock(&hugetlb_lock);
+			idx++;
+		}
+		cond_resched();
+	} while (hugetlb_cgroup_have_usage(h_cg));
+}
+
+/*
+ * Charge @nr_pages base pages against the current task's hugetlb cgroup for
+ * hstate @idx.  *@ptr receives the cgroup that was charged, or NULL when the
+ * controller is disabled or the hstate is too small to be tracked.  Returns
+ * the result of res_counter_charge(), or 0 when nothing was charged.
+ */
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+				 struct hugetlb_cgroup **ptr)
+{
+	struct hugetlb_cgroup *h_cg = NULL;
+	struct res_counter *fail_res;
+	unsigned long csize = nr_pages * PAGE_SIZE;
+	int ret = 0;
+
+	/*
+	 * We don't charge any cgroup if the controller is off or the
+	 * compound page has fewer than 3 pages.
+	 */
+	if (hugetlb_cgroup_disabled() ||
+	    huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) {
+		*ptr = h_cg;
+		return ret;
+	}
+
+	/* Keep retrying until a reference on an online css is obtained. */
+	for (;;) {
+		rcu_read_lock();
+		h_cg = hugetlb_cgroup_from_task(current);
+		if (css_tryget_online(&h_cg->css))
+			break;
+		rcu_read_unlock();
+	}
+	rcu_read_unlock();
+
+	ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+	css_put(&h_cg->css);
+
+	*ptr = h_cg;
+	return ret;
+}
+
+/*
+ * Record @h_cg as the owner of @page once a charge has succeeded.
+ * Should be called with hugetlb_lock held.
+ */
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+				  struct hugetlb_cgroup *h_cg,
+				  struct page *page)
+{
+	if (!hugetlb_cgroup_disabled() && h_cg)
+		set_hugetlb_cgroup(page, h_cg);
+}
+
+/*
+ * Detach @page from the cgroup it is charged to, if any, and return
+ * @nr_pages base pages to that cgroup's counter for hstate @idx.
+ * Should be called with hugetlb_lock held.
+ */
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+				  struct page *page)
+{
+	struct hugetlb_cgroup *owner;
+
+	if (hugetlb_cgroup_disabled())
+		return;
+
+	VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+
+	owner = hugetlb_cgroup_from_page(page);
+	if (unlikely(!owner))
+		return;
+
+	set_hugetlb_cgroup(page, NULL);
+	res_counter_uncharge(&owner->hugepage[idx], nr_pages * PAGE_SIZE);
+}
+
+/*
+ * Return @nr_pages base pages to @h_cg's counter for hstate @idx.  Does
+ * nothing when the controller is disabled, @h_cg is NULL, or the hstate is
+ * below the minimum tracked order.
+ */
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+				    struct hugetlb_cgroup *h_cg)
+{
+	if (hugetlb_cgroup_disabled() || !h_cg)
+		return;
+
+	if (huge_page_order(&hstates[idx]) >= HUGETLB_CGROUP_MIN_ORDER)
+		res_counter_uncharge(&h_cg->hugepage[idx],
+				     nr_pages * PAGE_SIZE);
+}
+
+/*
+ * cftype read handler: decode the hstate index and the counter attribute
+ * from cft->private and return that counter value.
+ */
+static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
+				   struct cftype *cft)
+{
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+	int idx = MEMFILE_IDX(cft->private);
+	int name = MEMFILE_ATTR(cft->private);
+
+	return res_counter_read_u64(&h_cg->hugepage[idx], name);
+}
+
+/*
+ * cftype write handler.  Only the limit attribute is writable, and never on
+ * the root cgroup; every other case yields -EINVAL.  Returns @nbytes on
+ * success, or the error from parsing or setting the limit.
+ */
+static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes, loff_t off)
+{
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+	unsigned long long val;
+	int idx, name, ret;
+
+	buf = strstrip(buf);
+	idx = MEMFILE_IDX(of_cft(of)->private);
+	name = MEMFILE_ATTR(of_cft(of)->private);
+
+	if (name != RES_LIMIT)
+		return -EINVAL;
+
+	/* Can't set limit on root */
+	if (hugetlb_cgroup_is_root(h_cg))
+		return -EINVAL;
+
+	/* This function does all necessary parse...reuse it */
+	ret = res_counter_memparse_write_strategy(buf, &val);
+	if (ret)
+		return ret;
+
+	ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+	return ret ?: nbytes;
+}
+
+/*
+ * cftype write handler for the resettable attributes: clears the recorded
+ * maximum usage or the failure count of the selected hstate counter.
+ * Returns @nbytes on success and -EINVAL for any other attribute.
+ */
+static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes, loff_t off)
+{
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
+	int idx = MEMFILE_IDX(of_cft(of)->private);
+	int name = MEMFILE_ATTR(of_cft(of)->private);
+
+	if (name == RES_MAX_USAGE)
+		res_counter_reset_max(&h_cg->hugepage[idx]);
+	else if (name == RES_FAILCNT)
+		res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+	else
+		return -EINVAL;
+
+	return nbytes;
+}
+
+/*
+ * Format @hsize (bytes) into @buf as a whole number of KB, MB or GB, using
+ * the largest unit that does not exceed @hsize.  Returns @buf.
+ */
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+	const unsigned long one_mb = 1UL << 20;
+	const unsigned long one_gb = 1UL << 30;
+
+	if (hsize < one_mb)
+		snprintf(buf, size, "%luKB", hsize >> 10);
+	else if (hsize < one_gb)
+		snprintf(buf, size, "%luMB", hsize >> 20);
+	else
+		snprintf(buf, size, "%luGB", hsize >> 30);
+
+	return buf;
+}
+
+/*
+ * Fill in and register the control files of hstate @idx: limit, usage,
+ * max usage and failcnt, each named after the formatted huge page size,
+ * followed by a zeroed terminator entry.
+ */
+static void __init __hugetlb_cgroup_file_init(int idx)
+{
+	struct hstate *h = &hstates[idx];
+	struct cftype *files = h->cgroup_files;
+	struct cftype *cft;
+	char buf[32];
+
+	/* format the size */
+	mem_fmt(buf, sizeof(buf), huge_page_size(h));
+
+	/* limit file: readable and writable */
+	cft = &files[0];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+	cft->read_u64 = hugetlb_cgroup_read_u64;
+	cft->write = hugetlb_cgroup_write;
+
+	/* usage file: read only */
+	cft = &files[1];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+	cft->read_u64 = hugetlb_cgroup_read_u64;
+
+	/* max usage file: a write resets it */
+	cft = &files[2];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+	cft->read_u64 = hugetlb_cgroup_read_u64;
+	cft->write = hugetlb_cgroup_reset;
+
+	/* failcnt file: a write resets it */
+	cft = &files[3];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+	cft->read_u64 = hugetlb_cgroup_read_u64;
+	cft->write = hugetlb_cgroup_reset;
+
+	/* NULL terminate the last cft */
+	memset(&files[4], 0, sizeof(files[4]));
+
+	WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
+}
+
+/* Register cgroup control files for every hstate large enough to track. */
+void __init hugetlb_cgroup_file_init(void)
+{
+	struct hstate *h;
+
+	for_each_hstate(h) {
+		/*
+		 * Add cgroup control files only if the huge page consists
+		 * of more than two normal pages. This is because we use
+		 * page[2].lru.next for storing cgroup details.
+		 */
+		if (huge_page_order(h) < HUGETLB_CGROUP_MIN_ORDER)
+			continue;
+
+		__hugetlb_cgroup_file_init(hstate_index(h));
+	}
+}
+
+/*
+ * Transfer cgroup ownership from @oldhpage to @newhpage and put the new
+ * page on the hstate's active list.
+ *
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+	struct hstate *h = page_hstate(oldhpage);
+	struct hugetlb_cgroup *owner;
+
+	if (hugetlb_cgroup_disabled())
+		return;
+
+	VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
+
+	spin_lock(&hugetlb_lock);
+	owner = hugetlb_cgroup_from_page(oldhpage);
+	set_hugetlb_cgroup(oldhpage, NULL);
+
+	/* hand the cgroup details over to the new page */
+	set_hugetlb_cgroup(newhpage, owner);
+	list_move(&newhpage->lru, &h->hugepage_activelist);
+	spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * cgroup subsystem descriptor for the hugetlb controller: wires up the
+ * css allocation, offline (re-parent charges) and free callbacks above.
+ */
+struct cgroup_subsys hugetlb_cgrp_subsys = {
+ .css_alloc = hugetlb_cgroup_css_alloc,
+ .css_offline = hugetlb_cgroup_css_offline,
+ .css_free = hugetlb_cgroup_css_free,
+};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 00000000000..95487c71cad
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,142 @@
+/* Inject a hwpoison memory failure on a arbitrary pfn */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include "internal.h"
+
+static struct dentry *hwpoison_dir;
+
+/*
+ * debugfs write handler: inject a software memory failure at pfn @val.
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, -ENXIO for an invalid pfn, 0 when
+ * the page is skipped (no reference obtainable, not an LRU/huge page, or
+ * rejected by the hwpoison filter), and otherwise whatever
+ * memory_failure() returns.
+ */
+static int hwpoison_inject(void *data, u64 val)
+{
+	unsigned long pfn = val;
+	struct page *p;
+	struct page *hpage;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!pfn_valid(pfn))
+		return -ENXIO;
+
+	p = pfn_to_page(pfn);
+	hpage = compound_head(p);
+	/*
+	 * This implies unable to support free buddy pages.
+	 */
+	if (!get_page_unless_zero(hpage))
+		return 0;
+
+	if (!hwpoison_filter_enable)
+		goto inject;
+
+	if (!PageLRU(p) && !PageHuge(p))
+		shake_page(p, 0);
+	/*
+	 * This implies unable to support non-LRU pages.
+	 */
+	if (!PageLRU(p) && !PageHuge(p))
+		goto put_out;
+
+	/*
+	 * do a racy check with elevated page count, to make sure PG_hwpoison
+	 * will only be set for the targeted owner (or on a free page).
+	 * We temporarily take page lock for try_get_mem_cgroup_from_page().
+	 * memory_failure() will redo the check reliably inside page lock.
+	 */
+	lock_page(hpage);
+	err = hwpoison_filter(hpage);
+	unlock_page(hpage);
+	if (err)
+		goto put_out;
+
+inject:
+	pr_info("Injecting memory failure at pfn %#lx\n", pfn);
+	/* The reference taken above is passed on (MF_COUNT_INCREASED). */
+	return memory_failure(pfn, 18, MF_COUNT_INCREASED);
+
+put_out:
+	/* Not injecting after all: drop the reference taken on hpage. */
+	put_page(hpage);
+	return 0;
+}
+
+/* debugfs write handler: unpoison pfn @val; requires CAP_SYS_ADMIN. */
+static int hwpoison_unpoison(void *data, u64 val)
+{
+	if (capable(CAP_SYS_ADMIN))
+		return unpoison_memory(val);
+
+	return -EPERM;
+}
+
+/* Write-only attributes (no getter): a written value is the target pfn. */
+DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
+
+/* Tear down the hwpoison debugfs directory, if it was created. */
+static void pfn_inject_exit(void)
+{
+	if (!hwpoison_dir)
+		return;
+
+	debugfs_remove_recursive(hwpoison_dir);
+}
+
+/*
+ * Create the "hwpoison" debugfs directory and its control files.  If any
+ * entry cannot be created, everything made so far is removed and -ENOMEM
+ * is returned.
+ */
+static int pfn_inject_init(void)
+{
+	hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
+	if (!hwpoison_dir)
+		return -ENOMEM;
+
+	/*
+	 * Note that the below poison/unpoison interfaces do not involve
+	 * hardware status change, hence do not require hardware support.
+	 * They are mainly for testing hwpoison in software level.
+	 */
+	if (!debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
+				 NULL, &hwpoison_fops))
+		goto fail;
+
+	if (!debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
+				 NULL, &unpoison_fops))
+		goto fail;
+
+	if (!debugfs_create_u32("corrupt-filter-enable", 0600,
+				hwpoison_dir, &hwpoison_filter_enable))
+		goto fail;
+
+	if (!debugfs_create_u32("corrupt-filter-dev-major", 0600,
+				hwpoison_dir, &hwpoison_filter_dev_major))
+		goto fail;
+
+	if (!debugfs_create_u32("corrupt-filter-dev-minor", 0600,
+				hwpoison_dir, &hwpoison_filter_dev_minor))
+		goto fail;
+
+	if (!debugfs_create_u64("corrupt-filter-flags-mask", 0600,
+				hwpoison_dir, &hwpoison_filter_flags_mask))
+		goto fail;
+
+	if (!debugfs_create_u64("corrupt-filter-flags-value", 0600,
+				hwpoison_dir, &hwpoison_filter_flags_value))
+		goto fail;
+
+#ifdef CONFIG_MEMCG_SWAP
+	if (!debugfs_create_u64("corrupt-filter-memcg", 0600,
+				hwpoison_dir, &hwpoison_filter_memcg))
+		goto fail;
+#endif
+
+	return 0;
+
+fail:
+	pfn_inject_exit();
+	return -ENOMEM;
+}
+
+module_init(pfn_inject_init);
+module_exit(pfn_inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/mm/init-mm.c b/mm/init-mm.c
new file mode 100644
index 00000000000..a56a851908d
--- /dev/null
+++ b/mm/init-mm.c
@@ -0,0 +1,25 @@
+#include <linux/mm_types.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpumask.h>
+
+#include <linux/atomic.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+/*
+ * Fallback when no earlier header defined INIT_MM_CONTEXT: expand to
+ * nothing so the init_mm initialiser below gets no extra fields.
+ */
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
+
+/*
+ * Statically initialised mm_struct using swapper_pg_dir as its page
+ * directory.  It starts with mm_users == 2 and mm_count == 1, an empty
+ * rbtree and mmlist, and unlocked mmap_sem/page_table_lock.
+ */
+struct mm_struct init_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ INIT_MM_CONTEXT(init_mm)
+};
diff --git a/mm/internal.h b/mm/internal.h
index d527b80b292..7f22a11fcc6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,30 +11,364 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
static inline void set_page_count(struct page *page, int v)
{
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request described by @ra: its start, size
+ * and async_size are passed straight to __do_page_cache_readahead(), whose
+ * return value is returned.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+		struct address_space *mapping, struct file *filp)
+{
+	unsigned long nr_submitted;
+
+	nr_submitted = __do_page_cache_readahead(mapping, filp, ra->start,
+						 ra->size, ra->async_size);
+	return nr_submitted;
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
- VM_BUG_ON(atomic_read(&page->_count));
+ /* Never valid for a tail page or for a page that already has a count. */
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
set_page_count(page, 1);
}
-static inline void __put_page(struct page *page)
+/*
+ * Take a reference through tail page @page: when @get_page_head is true
+ * the head page's _count is incremented, and get_huge_page_tail() is
+ * always called on the tail.  The head page must already have a positive
+ * _count.
+ */
+static inline void __get_page_tail_foll(struct page *page,
+ bool get_page_head)
+{
+ /*
+ * If we're getting a tail page, the elevated page->_count is
+ * required only in the head page and we will elevate the head
+ * page->_count and tail page->_mapcount.
+ *
+ * We elevate page_tail->_mapcount for tail pages to force
+ * page_tail->_count to be zero at all times to avoid getting
+ * false positives from get_page_unless_zero() with
+ * speculative page access (like in
+ * page_cache_get_speculative()) on tail pages.
+ */
+ VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
+ if (get_page_head)
+ atomic_inc(&page->first_page->_count);
+ get_huge_page_tail(page);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+	if (unlikely(PageTail(page))) {
+		/*
+		 * This is safe only because
+		 * __split_huge_page_refcount() can't run under
+		 * get_page_foll() because we hold the proper PT lock.
+		 */
+		__get_page_tail_foll(page, true);
+		return;
+	}
+
+	/*
+	 * Getting a normal page or the head of a compound page
+	 * requires to already have an elevated page->_count.
+	 */
+	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+	atomic_inc(&page->_count);
+}
+
+extern unsigned long highest_memmap_pfn;
+
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+extern bool zone_reclaimable(struct zone *zone);
+
+/*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
+ * in mm/page_alloc.c
+ */
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
+extern void prep_compound_page(struct page *page, unsigned long order);
+#ifdef CONFIG_MEMORY_FAILURE
+extern bool is_free_buddy_page(struct page *page);
+#endif
+extern int user_min_free_kbytes;
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+
+/*
+ * in mm/compaction.c
+ */
+/*
+ * compact_control is used to track pages being migrated and the free pages
+ * they are being migrated to during memory compaction. The free_pfn starts
+ * at the end of a zone and migrate_pfn begins at the start. Movable pages
+ * are moved to the end of a zone during a compaction run and the run
+ * completes when free_pfn <= migrate_pfn
+ */
+struct compact_control {
+ struct list_head freepages; /* List of free pages to migrate to */
+ struct list_head migratepages; /* List of pages being migrated */
+ unsigned long nr_freepages; /* Number of isolated free pages */
+ unsigned long nr_migratepages; /* Number of pages to migrate */
+ unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long migrate_pfn; /* isolate_migratepages search base */
+ enum migrate_mode mode; /* Async or sync migration mode */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool finished_update_free; /* True when the zone cached pfns are
+ * no longer being updated
+ */
+ /*
+ * NOTE(review): presumably the migrate-scanner counterpart of
+ * finished_update_free -- confirm against mm/compaction.c.
+ */
+ bool finished_update_migrate;
+
+ int order; /* order a direct compactor needs */
+ int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ struct zone *zone;
+ bool contended; /* True if a lock was contended, or
+ * need_resched() true during async
+ * compaction
+ */
+};
+
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn);
+unsigned long
+isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
+
+#endif
+
+/*
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel.
+ *
+ * The order is read from page_private(); PageBuddy() must have been
+ * checked by the caller.
+ */
+static inline unsigned long page_order(struct page *page)
+{
+	unsigned long order = page_private(page);
+
+	return order;
+}
+
+/* True when @flags has VM_MAYWRITE set and VM_SHARED clear. */
+static inline bool is_cow_mapping(vm_flags_t flags)
+{
+	vm_flags_t relevant = flags & (VM_SHARED | VM_MAYWRITE);
+
+	return relevant == VM_MAYWRITE;
+}
+
+/* mm/util.c */
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent);
+
+#ifdef CONFIG_MMU
+extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+/* Munlock the whole address range of @vma, from vm_start to vm_end. */
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+ munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+/*
+ * must be called with vma's mmap_sem held for read or write, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+extern unsigned int munlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked(). This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void clear_page_mlock(struct page *page);
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ *
+ * If @page was Mlocked, the flag is cleared there and set on @newpage, and
+ * the NR_MLOCK zone counters of both zones are adjusted with interrupts
+ * disabled.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+	unsigned long flags;
+	int nr_pages;
+
+	if (!TestClearPageMlocked(page))
+		return;
+
+	nr_pages = hpage_nr_pages(page);
+
+	local_irq_save(flags);
+	__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+	SetPageMlocked(newpage);
+	__mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
+	local_irq_restore(flags);
+}
+
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
+#else /* !CONFIG_MMU */
+/* !CONFIG_MMU: the mlock helpers are empty stubs. */
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+
+#endif /* !CONFIG_MMU */
+
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'. Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
{
- atomic_dec(&page->_count);
+ /* At or beyond MAX_ORDER_NR_PAGES: translate via the pfn instead of
+ * relying on plain struct page pointer arithmetic. */
+ if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+ return pfn_to_page(page_to_pfn(base) + offset);
+ return base + offset;
}
-extern void fastcall __init __free_pages_bootmem(struct page *page,
- unsigned int order);
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'. Handle any discontiguity in the mem_map.
+ *
+ * Returns the struct page for subpage 'offset'.  When 'offset' is a
+ * multiple of MAX_ORDER_NR_PAGES the page is looked up through its pfn,
+ * and NULL is returned if that pfn is not valid; otherwise the result is
+ * simply iter + 1.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+						struct page *base, int offset)
+{
+	unsigned long pfn;
+
+	if (likely((offset & (MAX_ORDER_NR_PAGES - 1)) != 0))
+		return iter + 1;
+
+	pfn = page_to_pfn(base) + offset;
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	return pfn_to_page(pfn);
+}
+/*
+ * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
+ * so all functions starting at paging_init should be marked __init
+ * in those cases. SPARSEMEM, however, allows for memory hotplug,
+ * and alloc_bootmem_node is not used.
+ */
+#ifdef CONFIG_SPARSEMEM
+#define __paginginit __meminit
+#else
+#define __paginginit __init
#endif
+
+/*
+ * Memory initialisation debug and verification.
+ * Levels are compared against mminit_loglevel by mminit_dprintk(); lower
+ * values are printed at lower loglevel settings.
+ */
+enum mminit_level {
+ MMINIT_WARNING,
+ MMINIT_VERIFY,
+ MMINIT_TRACE
+};
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+
+extern int mminit_loglevel;
+
+/*
+ * Print an "mminit::<prefix> ..." message when @level is below
+ * mminit_loglevel.  Messages at MMINIT_WARNING or lower use KERN_WARNING,
+ * all others KERN_DEBUG.
+ */
+#define mminit_dprintk(level, prefix, fmt, arg...) \
+do { \
+ if (level < mminit_loglevel) { \
+ printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
+ printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ } \
+} while (0)
+
+extern void mminit_verify_pageflags_layout(void);
+extern void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn);
+extern void mminit_verify_zonelist(void);
+
+#else
+
+/* !CONFIG_DEBUG_MEMORY_INIT: the debug/verification hooks do nothing. */
+static inline void mminit_dprintk(enum mminit_level level,
+ const char *prefix, const char *fmt, ...)
+{
+}
+
+static inline void mminit_verify_pageflags_layout(void)
+{
+}
+
+static inline void mminit_verify_page_links(struct page *page,
+ enum zone_type zone, unsigned long nid, unsigned long pfn)
+{
+}
+
+static inline void mminit_verify_zonelist(void)
+{
+}
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
+#if defined(CONFIG_SPARSEMEM)
+extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn);
+#else
+/* Without CONFIG_SPARSEMEM this is a no-op that leaves both pfns as-is. */
+static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+}
+#endif /* CONFIG_SPARSEMEM */
+
+#define ZONE_RECLAIM_NOSCAN -2
+#define ZONE_RECLAIM_FULL -1
+#define ZONE_RECLAIM_SOME 0
+#define ZONE_RECLAIM_SUCCESS 1
+
+extern int hwpoison_filter(struct page *p);
+
+extern u32 hwpoison_filter_dev_major;
+extern u32 hwpoison_filter_dev_minor;
+extern u64 hwpoison_filter_flags_mask;
+extern u64 hwpoison_filter_flags_value;
+extern u64 hwpoison_filter_memcg;
+extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN WMARK_MIN
+#define ALLOC_WMARK_LOW WMARK_LOW
+#define ALLOC_WMARK_HIGH WMARK_HIGH
+#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR 0x100 /* fair zone allocation */
+
+#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 00000000000..4a5822a586e
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+/* Interval start key of a vma: its vm_pgoff. */
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff;
+}
+
+/*
+ * Interval end key of a vma (inclusive): vm_pgoff plus the vma length in
+ * pages, minus one.
+ */
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+	unsigned long nr_pages = (v->vm_end - v->vm_start) >> PAGE_SHIFT;
+
+	return v->vm_pgoff + nr_pages - 1;
+}
+
+/*
+ * Instantiate the vma_interval_tree_* helpers for vm_area_struct, keyed by
+ * [vma_start_pgoff, vma_last_pgoff], with the rb node and subtree-last
+ * fields under shared.linear.  The empty argument is the generated
+ * functions' qualifier slot.
+ */
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+ unsigned long, shared.linear.rb_subtree_last,
+ vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/*
+ * Insert node immediately after prev in the interval tree.
+ *
+ * @node and @prev must have the same start pgoff.  If @prev has no right
+ * child, @node becomes that child.  Otherwise @node is linked as the left
+ * child of the leftmost node in @prev's right subtree, and every node
+ * visited on the way down has its rb_subtree_last raised to @node's last
+ * pgoff where needed.
+ */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+ struct vm_area_struct *prev,
+ struct rb_root *root)
+{
+ struct rb_node **link;
+ struct vm_area_struct *parent;
+ unsigned long last = vma_last_pgoff(node);
+
+ VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+
+ if (!prev->shared.linear.rb.rb_right) {
+ parent = prev;
+ link = &prev->shared.linear.rb.rb_right;
+ } else {
+ parent = rb_entry(prev->shared.linear.rb.rb_right,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ while (parent->shared.linear.rb.rb_left) {
+ parent = rb_entry(parent->shared.linear.rb.rb_left,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ }
+ link = &parent->shared.linear.rb.rb_left;
+ }
+
+ /* A freshly linked node has no children: its subtree ends at itself. */
+ node->shared.linear.rb_subtree_last = last;
+ rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+ rb_insert_augmented(&node->shared.linear.rb, root,
+ &vma_interval_tree_augment);
+}
+
+/* Interval start key of an anon_vma_chain: that of the vma it points to. */
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_start_pgoff(avc->vma);
+}
+
+/* Interval end key of an anon_vma_chain: that of the vma it points to. */
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_last_pgoff(avc->vma);
+}
+
+/*
+ * Instantiate static inline __anon_vma_interval_tree_* helpers for
+ * anon_vma_chain; the public wrappers below call into them.
+ */
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+ avc_start_pgoff, avc_last_pgoff,
+ static inline, __anon_vma_interval_tree)
+
+/*
+ * Insert @node into the anon_vma interval tree at @root.  With
+ * CONFIG_DEBUG_VM_RB the interval keys are cached in the node first so
+ * that anon_vma_interval_tree_verify() can compare against them later.
+ */
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+ node->cached_vma_start = avc_start_pgoff(node);
+ node->cached_vma_last = avc_last_pgoff(node);
+#endif
+ __anon_vma_interval_tree_insert(node, root);
+}
+
+/* Remove @node from the anon_vma interval tree at @root. */
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+ __anon_vma_interval_tree_remove(node, root);
+}
+
+/*
+ * Public wrapper: start iterating @root for the range [@first, @last] by
+ * delegating to the generated __anon_vma_interval_tree_iter_first().
+ */
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+/*
+ * Public wrapper: continue the iteration after @node for the range
+ * [@first, @last] via the generated __anon_vma_interval_tree_iter_next().
+ */
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+/*
+ * Warn (once per check) if the vma's current interval keys no longer match
+ * the values cached in @node at insertion time.
+ */
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+ WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+ WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
new file mode 100644
index 00000000000..7b5dbd1517b
--- /dev/null
+++ b/mm/iov_iter.c
@@ -0,0 +1,743 @@
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+ void *kaddr, *from;
+
+ if (unlikely(bytes > i->count)) /* never copy more than the iterator still holds */
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ if (!fault_in_pages_writeable(buf, copy)) { /* prefault returned 0: try the copy under kmap_atomic() */
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+
+ /* first chunk, usually the only one */
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left; /* copy now holds the bytes actually transferred */
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) { /* walk following segments until a short copy or done */
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left;
+ skip = copy; /* new segment, so the offset within it restarts */
+ from += copy;
+ bytes -= copy;
+ }
+ if (likely(!bytes)) {
+ kunmap_atomic(kaddr);
+ goto done;
+ }
+ offset = from - kaddr; /* where in the page the non-atomic path must resume */
+ buf += copy;
+ kunmap_atomic(kaddr);
+ copy = min(bytes, iov->iov_len - skip);
+ }
+ /* Too bad - revert to non-atomic kmap */
+ kaddr = kmap(page);
+ from = kaddr + offset;
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ kunmap(page);
+done:
+ if (skip == iov->iov_len) { /* current segment fully consumed: point at the next one */
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes; /* wanted - bytes is the total number of bytes copied */
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+ void *kaddr, *to;
+
+ if (unlikely(bytes > i->count)) /* never copy more than the iterator still holds */
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ if (!fault_in_pages_readable(buf, copy)) { /* prefault returned 0: try the copy under kmap_atomic() */
+ kaddr = kmap_atomic(page);
+ to = kaddr + offset;
+
+ /* first chunk, usually the only one */
+ left = __copy_from_user_inatomic(to, buf, copy);
+ copy -= left; /* copy now holds the bytes actually transferred */
+ skip += copy;
+ to += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) { /* walk following segments until a short copy or done */
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_from_user_inatomic(to, buf, copy);
+ copy -= left;
+ skip = copy; /* new segment, so the offset within it restarts */
+ to += copy;
+ bytes -= copy;
+ }
+ if (likely(!bytes)) {
+ kunmap_atomic(kaddr);
+ goto done;
+ }
+ offset = to - kaddr; /* where in the page the non-atomic path must resume */
+ buf += copy;
+ kunmap_atomic(kaddr);
+ copy = min(bytes, iov->iov_len - skip);
+ }
+ /* Too bad - revert to non-atomic kmap */
+ kaddr = kmap(page);
+ to = kaddr + offset;
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip += copy;
+ to += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip = copy;
+ to += copy;
+ bytes -= copy;
+ }
+ kunmap(page);
+done:
+ if (skip == iov->iov_len) { /* current segment fully consumed: point at the next one */
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes; /* wanted - bytes is the total number of bytes copied */
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0; /* only the first segment is entered at a non-zero offset */
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy; /* counted in full; any shortfall is subtracted on return */
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left)) /* short copy: stop at the first segment that could not be read fully */
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied before it. The iterator itself is not advanced here.
+ */
+static size_t copy_from_user_atomic_iovec(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) { /* single segment: copy directly, no segment walk needed */
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+
+static void advance_iovec(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes); /* advancing past the end of the iterator is a caller bug */
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overrunning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) { /* segment used up: move to the start of the next one */
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->iov = iov; /* publish the new position only after the walk is complete */
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ if (!(i->type & ITER_BVEC)) {
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset); /* clamp to what is left of the first segment */
+ return fault_in_pages_readable(buf, bytes);
+ }
+ return 0; /* ITER_BVEC iterators: nothing is faulted in, report success */
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+static unsigned long alignment_iovec(const struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ unsigned long res;
+ size_t size = i->count;
+ size_t n;
+
+ if (!size)
+ return 0;
+
+ res = (unsigned long)iov->iov_base + i->iov_offset; /* res accumulates the OR of every address and length used */
+ n = iov->iov_len - i->iov_offset;
+ if (n >= size) /* the remaining count fits inside the first segment */
+ return res | size;
+ size -= n;
+ res |= n;
+ while (size > (++iov)->iov_len) { /* whole segments that are consumed completely */
+ res |= (unsigned long)iov->iov_base | iov->iov_len;
+ size -= iov->iov_len;
+ }
+ res |= (unsigned long)iov->iov_base | size; /* final segment contributes only the bytes still needed */
+ return res;
+}
+
+void iov_iter_init(struct iov_iter *i, int direction,
+ const struct iovec *iov, unsigned long nr_segs,
+ size_t count)
+{
+ /* Position the iterator at the very start of the segment array. */
+ i->iov = iov;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+
+ /* When get_fs() is KERNEL_DS the iterator is additionally tagged ITER_KVEC. */
+ i->type = segment_eq(get_fs(), KERNEL_DS) ? direction | ITER_KVEC : direction;
+}
+EXPORT_SYMBOL(iov_iter_init);
+
+static ssize_t get_pages_iovec(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ size_t offset = i->iov_offset;
+ const struct iovec *iov = i->iov;
+ size_t len;
+ unsigned long addr;
+ int n;
+ int res;
+
+ len = iov->iov_len - offset; /* only the current segment is considered */
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ addr = (unsigned long)iov->iov_base + offset;
+ len += *start = addr & (PAGE_SIZE - 1); /* *start = offset in the first page; len now measured from the page boundary */
+ addr &= ~(PAGE_SIZE - 1);
+ n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); /* third argument is non-zero when the iterator is not tagged WRITE */
+ if (unlikely(res < 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start; /* byte count from *start; fewer pages than asked shortens it */
+}
+
+static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ size_t offset = i->iov_offset;
+ const struct iovec *iov = i->iov;
+ size_t len;
+ unsigned long addr;
+ void *p;
+ int n;
+ int res;
+
+ len = iov->iov_len - offset; /* only the current segment is considered */
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ addr = (unsigned long)iov->iov_base + offset;
+ len += *start = addr & (PAGE_SIZE - 1); /* *start = offset in the first page; len now measured from the page boundary */
+ addr &= ~(PAGE_SIZE - 1);
+ n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
+ if (!p) /* kmalloc failed: fall back to vmalloc for the page-pointer array */
+ p = vmalloc(n * sizeof(struct page *));
+ if (!p)
+ return -ENOMEM;
+
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
+ if (unlikely(res < 0)) {
+ kvfree(p); /* the array may have come from either allocator */
+ return res;
+ }
+ *pages = p; /* array handed to the caller; NOTE(review): presumably released with kvfree() - confirm at call sites */
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+}
+
+static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
+{
+ size_t offset = i->iov_offset;
+ size_t size = i->count;
+ const struct iovec *iov = i->iov;
+ int npages = 0;
+ int n;
+
+ for (n = 0; size && n < i->nr_segs; n++, iov++) {
+ unsigned long addr = (unsigned long)iov->iov_base + offset;
+ size_t len = iov->iov_len - offset;
+ offset = 0; /* the starting offset applies to the first segment only */
+ if (unlikely(!len)) /* empty segment */
+ continue;
+ if (len > size)
+ len = size;
+ npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE /* pages touched by [addr, addr + len) */
+ - addr / PAGE_SIZE;
+ if (npages >= maxpages) /* don't bother going further */
+ return maxpages;
+ size -= len;
+ offset = 0; /* redundant: already cleared at the top of this iteration */
+ }
+ return min(npages, maxpages);
+}
+
+static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
+{
+ char *src = kmap_atomic(page); /* short-lived mapping, dropped right after the copy */
+ memcpy(to, &src[offset], len);
+ kunmap_atomic(src);
+}
+
+static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
+{
+ char *dst = kmap_atomic(page); /* short-lived mapping, dropped right after the copy */
+ memcpy(&dst[offset], from, len);
+ kunmap_atomic(dst);
+}
+
+static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, wanted;
+ const struct bio_vec *bvec;
+ void *kaddr, *from;
+
+ if (unlikely(bytes > i->count)) /* never copy more than the iterator still holds */
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ bvec = i->bvec;
+ skip = i->iov_offset;
+ copy = min_t(size_t, bytes, bvec->bv_len - skip);
+
+ kaddr = kmap_atomic(page); /* source page stays mapped while memcpy_to_page() maps each destination page */
+ from = kaddr + offset;
+ memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (bytes) { /* unlike the iovec variant there is no short-copy case to handle */
+ bvec++;
+ copy = min(bytes, (size_t)bvec->bv_len);
+ memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
+ skip = copy; /* new segment, so the offset within it restarts */
+ from += copy;
+ bytes -= copy;
+ }
+ kunmap_atomic(kaddr);
+ if (skip == bvec->bv_len) { /* current segment fully consumed: point at the next one */
+ bvec++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes; /* bytes is 0 here, so this subtracts everything wanted */
+ i->nr_segs -= bvec - i->bvec;
+ i->bvec = bvec;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, wanted;
+ const struct bio_vec *bvec;
+ void *kaddr, *to;
+
+ if (unlikely(bytes > i->count)) /* never copy more than the iterator still holds */
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ bvec = i->bvec;
+ skip = i->iov_offset;
+
+ kaddr = kmap_atomic(page); /* destination page stays mapped while memcpy_from_page() maps each source page */
+
+ to = kaddr + offset;
+
+ copy = min(bytes, bvec->bv_len - skip);
+
+ memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
+
+ to += copy;
+ skip += copy;
+ bytes -= copy;
+
+ while (bytes) { /* unlike the iovec variant there is no short-copy case to handle */
+ bvec++;
+ copy = min(bytes, (size_t)bvec->bv_len);
+ memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
+ skip = copy; /* new segment, so the offset within it restarts */
+ to += copy;
+ bytes -= copy;
+ }
+ kunmap_atomic(kaddr);
+ if (skip == bvec->bv_len) { /* current segment fully consumed: point at the next one */
+ bvec++;
+ skip = 0;
+ }
+ i->count -= wanted; /* the loop above only exits with bytes == 0, so all of wanted was copied */
+ i->nr_segs -= bvec - i->bvec;
+ i->bvec = bvec;
+ i->iov_offset = skip;
+ return wanted;
+}
+
+static size_t copy_from_user_bvec(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t left;
+ const struct bio_vec *bvec;
+ size_t base = i->iov_offset;
+
+ kaddr = kmap_atomic(page);
+ for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) { /* base is non-zero for the first segment only */
+ size_t copy = min(left, bvec->bv_len - base);
+ if (!bvec->bv_len) /* skip empty segments */
+ continue;
+ memcpy_from_page(kaddr + offset, bvec->bv_page,
+ bvec->bv_offset + base, copy);
+ offset += copy;
+ left -= copy;
+ }
+ kunmap_atomic(kaddr);
+ return bytes; /* always the full request; the iterator itself is not modified here */
+}
+
+static void advance_bvec(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes); /* advancing past the end of the iterator is a caller bug */
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct bio_vec *bvec = i->bvec;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !bvec->bv_len check ensures we skip over unlikely
+ * zero-length segments (without overrunning the bvec array).
+ */
+ while (bytes || unlikely(i->count && !bvec->bv_len)) {
+ int copy;
+
+ copy = min(bytes, bvec->bv_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (bvec->bv_len == base) { /* segment used up: move to the start of the next one */
+ bvec++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->bvec = bvec; /* publish the new position only after the walk is complete */
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+
+static unsigned long alignment_bvec(const struct iov_iter *i)
+{
+ const struct bio_vec *bvec = i->bvec;
+ unsigned long res;
+ size_t size = i->count;
+ size_t n;
+
+ if (!size)
+ return 0;
+
+ res = bvec->bv_offset + i->iov_offset; /* res accumulates the OR of every in-page offset and length used */
+ n = bvec->bv_len - i->iov_offset;
+ if (n >= size) /* the remaining count fits inside the first segment */
+ return res | size;
+ size -= n;
+ res |= n;
+ while (size > (++bvec)->bv_len) { /* whole segments that are consumed completely */
+ res |= bvec->bv_offset | bvec->bv_len;
+ size -= bvec->bv_len;
+ }
+ res |= bvec->bv_offset | size; /* final segment contributes only the bytes still needed */
+ return res;
+}
+
+static ssize_t get_pages_bvec(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t len = bvec->bv_len - i->iov_offset; /* only the current segment is considered */
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ *start = bvec->bv_offset + i->iov_offset;
+
+ get_page(*pages = bvec->bv_page); /* exactly one page is stored in pages[0], after get_page() on it */
+
+ return len;
+}
+
+static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t len = bvec->bv_len - i->iov_offset; /* only the current segment is considered */
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ *start = bvec->bv_offset + i->iov_offset;
+
+ *pages = kmalloc(sizeof(struct page *), GFP_KERNEL); /* one-entry array: a single page is returned */
+ if (!*pages)
+ return -ENOMEM;
+
+ get_page(**pages = bvec->bv_page); /* store the segment's page in the new array, after get_page() on it */
+
+ return len;
+}
+
+static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
+{
+ size_t offset = i->iov_offset;
+ size_t size = i->count;
+ const struct bio_vec *bvec = i->bvec;
+ int npages = 0;
+ int n;
+
+ for (n = 0; size && n < i->nr_segs; n++, bvec++) {
+ size_t len = bvec->bv_len - offset;
+ offset = 0; /* the starting offset applies to the first segment only */
+ if (unlikely(!len)) /* empty segment */
+ continue;
+ if (len > size)
+ len = size;
+ npages++; /* each non-empty segment is counted as one page */
+ if (npages >= maxpages) /* don't bother going further */
+ return maxpages;
+ size -= len;
+ offset = 0; /* redundant: already cleared at the top of this iteration */
+ }
+ return min(npages, maxpages);
+}
+
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return copy_page_to_iter_iovec(page, offset, bytes, i);
+
+ return copy_page_to_iter_bvec(page, offset, bytes, i);
+}
+EXPORT_SYMBOL(copy_page_to_iter);
+
+size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return copy_page_from_iter_iovec(page, offset, bytes, i);
+
+ return copy_page_from_iter_bvec(page, offset, bytes, i);
+}
+EXPORT_SYMBOL(copy_page_from_iter);
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return copy_from_user_atomic_iovec(page, i, offset, bytes);
+
+ return copy_from_user_bvec(page, i, offset, bytes);
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+void iov_iter_advance(struct iov_iter *i, size_t size)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ advance_iovec(i, size);
+ else
+ advance_bvec(i, size);
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Return the count of just the current iov_iter segment, capped by i->count.
+ */
+size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+ if (i->nr_segs == 1) /* last segment: whatever is left belongs to it */
+ return i->count;
+ else if (i->type & ITER_BVEC) /* bvec iterators must read the bio_vec member, not the iovec one */
+ return min(i->count, i->bvec->bv_len - i->iov_offset);
+ else
+ return min(i->count, i->iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
+unsigned long iov_iter_alignment(const struct iov_iter *i)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return alignment_iovec(i);
+
+ return alignment_bvec(i);
+}
+EXPORT_SYMBOL(iov_iter_alignment);
+
+ssize_t iov_iter_get_pages(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return get_pages_iovec(i, pages, maxsize, start);
+
+ return get_pages_bvec(i, pages, maxsize, start);
+}
+EXPORT_SYMBOL(iov_iter_get_pages);
+
+ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return get_pages_alloc_iovec(i, pages, maxsize, start);
+
+ return get_pages_alloc_bvec(i, pages, maxsize, start);
+}
+EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+
+int iov_iter_npages(const struct iov_iter *i, int maxpages)
+{
+ if (!(i->type & ITER_BVEC)) /* no ITER_BVEC flag: iovec-backed iterator */
+ return iov_iter_npages_iovec(i, maxpages);
+
+ return iov_iter_npages_bvec(i, maxpages);
+}
+EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
new file mode 100644
index 00000000000..fd814fd6131
--- /dev/null
+++ b/mm/kmemcheck.c
@@ -0,0 +1,122 @@
+#include <linux/gfp.h>
+#include <linux/mm_types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmemcheck.h>
+
+void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ pages = 1 << order; /* shadow block has the same order as the block it shadows */
+
+ /*
+ * With kmemcheck enabled, we need to allocate a memory area for the
+ * shadow bits as well.
+ */
+ shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+ if (!shadow) {
+ if (printk_ratelimit())
+ printk(KERN_ERR "kmemcheck: failed to allocate "
+ "shadow bitmap\n");
+ return; /* failure is only logged; page[].shadow is not assigned */
+ }
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = page_address(&shadow[i]); /* page i is paired with shadow page i */
+
+ /*
+ * Mark it as non-present for the MMU so that our accesses to
+ * this memory will trigger a page fault and let us analyze
+ * the memory accesses.
+ */
+ kmemcheck_hide_pages(page, pages);
+}
+
+void kmemcheck_free_shadow(struct page *page, int order)
+{
+ struct page *shadow;
+ int pages;
+ int i;
+
+ if (!kmemcheck_page_is_tracked(page)) /* untracked page: nothing to undo */
+ return;
+
+ pages = 1 << order;
+
+ kmemcheck_show_pages(page, pages); /* reverses the kmemcheck_hide_pages() done in kmemcheck_alloc_shadow() */
+
+ shadow = virt_to_page(page[0].shadow); /* shadow block is recovered from the first page's pointer */
+
+ for(i = 0; i < pages; ++i)
+ page[i].shadow = NULL;
+
+ __free_pages(shadow, order);
+}
+
+void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
+ size_t size)
+{
+ /*
+ * Has already been memset(), which initializes the shadow for us
+ * as well.
+ */
+ if (gfpflags & __GFP_ZERO)
+ return;
+
+ /* No need to initialize the shadow of a non-tracked slab. */
+ if (s->flags & SLAB_NOTRACK)
+ return;
+
+ if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
+ /*
+ * Allow notracked objects to be allocated from
+ * tracked caches. Note however that these objects
+ * will still get page faults on access, they just
+ * won't ever be flagged as uninitialized. If page
+ * faults are not acceptable, the slab cache itself
+ * should be marked NOTRACK.
+ */
+ kmemcheck_mark_initialized(object, size);
+ } else if (!s->ctor) { /* objects from caches with a constructor are left as they are */
+ /*
+ * New objects should be marked uninitialized before
+ * they're returned to the caller.
+ */
+ kmemcheck_mark_uninitialized(object, size);
+ }
+}
+
+void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
+{
+ /* Skip ctor caches; RCU freeing is unsupported for now, so hide false positives. */
+ if (!(s->ctor || (s->flags & SLAB_DESTROY_BY_RCU)))
+ kmemcheck_mark_freed(object, size);
+}
+
+void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
+ gfp_t gfpflags)
+{
+ int pages;
+
+ if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) /* highmem and explicitly untracked allocations get no shadow */
+ return;
+
+ pages = 1 << order;
+
+ /*
+ * NOTE: We choose to track GFP_ZERO pages too; in fact, they
+ * can become uninitialized by copying uninitialized memory
+ * into them.
+ */
+
+ /* XXX: Can use zone->node for node? */
+ kmemcheck_alloc_shadow(page, order, gfpflags, -1); /* -1 is passed as the node argument */
+
+ if (gfpflags & __GFP_ZERO) /* zeroed pages start out as initialized */
+ kmemcheck_mark_initialized_pages(page, pages);
+ else
+ kmemcheck_mark_uninitialized_pages(page, pages);
+}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 00000000000..dcdcadb6953
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
+/*
+ * mm/kmemleak-test.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define pr_fmt(fmt) "kmemleak: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/fdtable.h>
+
+#include <linux/kmemleak.h>
+
+struct test_node {
+ long header[25]; /* not accessed in this file other than being zeroed by kzalloc() */
+ struct list_head list; /* links the node into test_list */
+ long footer[25]; /* not accessed in this file other than being zeroed by kzalloc() */
+};
+
+static LIST_HEAD(test_list); /* filled by kmemleak_test_init(); entries are unlinked but not freed by kmemleak_test_exit() */
+static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* receives one kmalloc(129) pointer per possible CPU */
+
+/*
+ * Some very simple testing. This function needs to be extended for
+ * proper testing.
+ */
+static int __init kmemleak_test_init(void)
+{
+ struct test_node *elem;
+ int i;
+
+ printk(KERN_INFO "Kmemleak testing\n");
+
+ /* make some orphan objects */
+ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); /* result is only printed, never stored or freed */
+ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+#ifndef CONFIG_MODULES
+ pr_info("kmem_cache_alloc(files_cachep) = %p\n", /* only compiled when CONFIG_MODULES is off */
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+ pr_info("kmem_cache_alloc(files_cachep) = %p\n",
+ kmem_cache_alloc(files_cachep, GFP_KERNEL));
+#endif
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+
+ /*
+ * Add elements to a list. They should only appear as orphan
+ * after the module is removed.
+ */
+ for (i = 0; i < 10; i++) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+ pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
+ if (!elem)
+ return -ENOMEM; /* elements already on test_list are left in place */
+ INIT_LIST_HEAD(&elem->list);
+ list_add_tail(&elem->list, &test_list);
+ }
+
+ for_each_possible_cpu(i) {
+ per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); /* stored in the per-CPU variable rather than dropped */
+ pr_info("kmalloc(129) = %p\n",
+ per_cpu(kmemleak_test_pointer, i));
+ }
+
+ return 0;
+}
+module_init(kmemleak_test_init);
+
+static void __exit kmemleak_test_exit(void)
+{
+ struct test_node *elem, *tmp;
+
+ /*
+ * Remove the list elements without actually freeing the
+ * memory.
+ */
+ list_for_each_entry_safe(elem, tmp, &test_list, list) /* _safe variant because entries are unlinked while iterating */
+ list_del(&elem->list);
+}
+module_exit(kmemleak_test_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 00000000000..3cda50c1e39
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1920 @@
+/*
+ * mm/kmemleak.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * For more information on the algorithm and kmemleak usage, please see
+ * Documentation/kmemleak.txt.
+ *
+ * Notes on locking
+ * ----------------
+ *
+ * The following locks and mutexes are used by kmemleak:
+ *
+ * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * accesses to the object_tree_root. The object_list is the main list
+ * holding the metadata (struct kmemleak_object) for the allocated memory
+ * blocks. The object_tree_root is a red black tree used to look-up
+ * metadata based on a pointer to the corresponding memory block. The
+ * kmemleak_object structures are added to the object_list and
+ * object_tree_root in the create_object() function called from the
+ * kmemleak_alloc() callback and removed in delete_object() called from the
+ * kmemleak_free() callback
+ * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
+ * the metadata (e.g. count) are protected by this lock. Note that some
+ * members of this structure may be protected by other means (atomic or
+ * kmemleak_lock). This lock is also held when scanning the corresponding
+ * memory block to avoid the kernel freeing it via the kmemleak_free()
+ * callback. This is less heavyweight than holding a global lock like
+ * kmemleak_lock during scanning
+ * - scan_mutex (mutex): ensures that only one thread may scan the memory for
+ * unreferenced objects at a time. The gray_list contains the objects which
+ * are already referenced or marked as false positives and need to be
+ * scanned. This list is only modified during a scanning episode when the
+ * scan_mutex is held. At the end of a scan, the gray_list is always empty.
+ * Note that the kmemleak_object.use_count is incremented when an object is
+ * added to the gray_list and therefore cannot be freed. This mutex also
+ * prevents multiple users of the "kmemleak" debugfs file together with
+ * modifications to the memory scanning parameters including the scan_thread
+ * pointer
+ *
+ * The kmemleak_object structures have a use_count incremented or decremented
+ * using the get_object()/put_object() functions. When the use_count becomes
+ * 0, this count can no longer be incremented and put_object() schedules the
+ * kmemleak_object freeing via an RCU callback. All calls to the get_object()
+ * function must be protected by rcu_read_lock() to avoid accessing a freed
+ * structure.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/stacktrace.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/mm.h>
+#include <linux/workqueue.h>
+#include <linux/crc32.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <linux/atomic.h>
+
+#include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
+#include <linux/memory_hotplug.h>
+
+/*
+ * Kmemleak configuration and common defines.
+ */
+#define MAX_TRACE 16 /* stack trace length */
+#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
+#define SECS_FIRST_SCAN 60 /* delay before the first scan */
+#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
+#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
+
+#define BYTES_PER_POINTER sizeof(void *)
+
+/* GFP bitmask for kmemleak internal allocations */
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+ __GFP_NORETRY | __GFP_NOMEMALLOC | \
+ __GFP_NOWARN)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+ struct hlist_node node; /* entry in kmemleak_object.area_list */
+ unsigned long start; /* address where the area begins */
+ size_t size; /* length of the area */
+};
+
+#define KMEMLEAK_GREY 0
+#define KMEMLEAK_BLACK -1
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * rb_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+ spinlock_t lock;
+ unsigned long flags; /* object status flags */
+ struct list_head object_list;
+ struct list_head gray_list;
+ struct rb_node rb_node;
+ struct rcu_head rcu; /* object_list lockless traversal */
+ /* object usage count; object freed when use_count == 0 */
+ atomic_t use_count;
+ /* start address and size of the tracked memory block */
+ unsigned long pointer;
+ size_t size;
+ /* minimum number of pointers found before it is considered a leak */
+ int min_count;
+ /* the total number of pointers found pointing to this object */
+ int count;
+ /* checksum for detecting modified objects */
+ u32 checksum;
+ /* memory ranges to be scanned inside an object (empty for all) */
+ struct hlist_head area_list;
+ /* allocation stack trace and the number of valid entries in it */
+ unsigned long trace[MAX_TRACE];
+ unsigned int trace_len;
+ unsigned long jiffies; /* creation timestamp */
+ pid_t pid; /* pid of the current task */
+ char comm[TASK_COMM_LEN]; /* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED (1 << 0)
+/* flag set after the first reporting of an unreferenced object */
+#define OBJECT_REPORTED (1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN (1 << 2)
+
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE 16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE 1
+/* include ASCII after the hex output */
+#define HEX_ASCII 1
+/* max number of lines to be printed */
+#define HEX_MAX_LINES 2
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
+static DEFINE_RWLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static int kmemleak_enabled;
+/* set in the late_initcall if there were no errors */
+static int kmemleak_initialized;
+/* enables or disables early logging of the memory operations */
+static int kmemleak_early_log = 1;
+/* set if a kmemleak warning was issued */
+static int kmemleak_warning;
+/* set if a fatal kmemleak error has occurred */
+static int kmemleak_error;
+
+/* minimum and maximum address that may be valid pointers */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+static struct task_struct *scan_thread;
+/* used to avoid reporting of recently allocated objects */
+static unsigned long jiffies_min_age;
+static unsigned long jiffies_last_scan;
+/* delay between automatic memory scannings */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan = 1;
+/* protects the memory scanning, parameters and debug/kmemleak file access */
+static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+/* If there are leaks that can be reported */
+static bool kmemleak_found_leaks;
+
+/*
+ * Early object allocation/freeing logging. Kmemleak is initialized after the
+ * kernel allocator. However, both the kernel allocator and kmemleak may
+ * allocate memory blocks which need to be tracked. Kmemleak defines an
+ * arbitrary buffer to hold the allocation/freeing information before it is
+ * fully initialized.
+ */
+
+/*
+ * kmemleak operation type for early logging. log_early() stores one of
+ * these values in early_log.op_type to record which kmemleak_*() call was
+ * deferred.
+ */
+enum {
+ KMEMLEAK_ALLOC,
+ KMEMLEAK_ALLOC_PERCPU,
+ KMEMLEAK_FREE,
+ KMEMLEAK_FREE_PART,
+ KMEMLEAK_FREE_PERCPU,
+ KMEMLEAK_NOT_LEAK,
+ KMEMLEAK_IGNORE,
+ KMEMLEAK_SCAN_AREA,
+ KMEMLEAK_NO_SCAN
+};
+
+/*
+ * Structure holding the information passed to kmemleak callbacks during the
+ * early logging.
+ */
+struct early_log {
+ int op_type; /* kmemleak operation type */
+ const void *ptr; /* allocated/freed memory block */
+ size_t size; /* memory block size */
+ int min_count; /* minimum reference count */
+ unsigned long trace[MAX_TRACE]; /* stack trace */
+ unsigned int trace_len; /* stack trace length */
+};
+
+/* early logging buffer and current position */
+static struct early_log
+ early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
+static int crt_early_log __initdata;
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace. The kmemleak_warning flag is
+ * also set to record that a warning was issued.
+ */
+#define kmemleak_warn(x...) do { \
+ pr_warning(x); \
+ dump_stack(); \
+ kmemleak_warning = 1; \
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
+ * recovered from. Kmemleak will be disabled and further allocation/freeing
+ * tracing no longer available.
+ */
+#define kmemleak_stop(x...) do { \
+ kmemleak_warn(x); \
+ kmemleak_disable(); \
+} while (0)
+
+/*
+ * Write a hex dump of the beginning of the object's memory block to the seq
+ * file. At most HEX_MAX_LINES rows of HEX_ROW_SIZE bytes are printed so that
+ * the seq file is not flooded. The caller must hold object->lock.
+ */
+static void hex_dump_object(struct seq_file *seq,
+			    struct kmemleak_object *object)
+{
+	const u8 *bytes = (const u8 *)object->pointer;
+	unsigned char row[HEX_ROW_SIZE * 5];
+	int offset, total;
+
+	/* never dump more than HEX_MAX_LINES rows */
+	total = min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
+
+	seq_printf(seq, " hex dump (first %d bytes):\n", total);
+	for (offset = 0; offset < total; offset += HEX_ROW_SIZE) {
+		/* the last row may hold fewer than HEX_ROW_SIZE bytes */
+		int row_len = min(total - offset, HEX_ROW_SIZE);
+
+		hex_dump_to_buffer(bytes + offset, row_len, HEX_ROW_SIZE,
+				   HEX_GROUP_SIZE, row, sizeof(row),
+				   HEX_ASCII);
+		seq_printf(seq, " %s\n", row);
+	}
+}
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray - not orphan, not marked as false positive (min_count == 0) or
+ * sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ * (min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static bool color_white(const struct kmemleak_object *object)
+{
+	/* a count equal to KMEMLEAK_BLACK is never treated as white */
+	if (object->count == KMEMLEAK_BLACK)
+		return false;
+
+	/* white: fewer references found than the required minimum */
+	return object->count < object->min_count;
+}
+
+static bool color_gray(const struct kmemleak_object *object)
+{
+	/* black objects (min_count == KMEMLEAK_BLACK) are never gray */
+	if (object->min_count == KMEMLEAK_BLACK)
+		return false;
+
+	/* gray: at least min_count references were found */
+	return object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they
+ * have not been deleted and they have reached a minimum age. The age check
+ * avoids false positives caused by pointers temporarily stored in CPU
+ * registers.
+ */
+static bool unreferenced_object(struct kmemleak_object *object)
+{
+	if (!color_white(object))
+		return false;
+
+	/* the memory block was already freed */
+	if (!(object->flags & OBJECT_ALLOCATED))
+		return false;
+
+	/* old enough at the time of the last scan? */
+	return time_before_eq(object->jiffies + jiffies_min_age,
+			      jiffies_last_scan);
+}
+
+/*
+ * Write the report for one unreferenced object to the seq file: address and
+ * size, allocating task, age, a short hex dump and the allocation backtrace.
+ * The caller must hold object->lock.
+ */
+static void print_unreferenced(struct seq_file *seq,
+			       struct kmemleak_object *object)
+{
+	unsigned int age_ms = jiffies_to_msecs(jiffies - object->jiffies);
+	unsigned int frame;
+
+	seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+		   object->pointer, object->size);
+	seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+		   object->comm, object->pid, object->jiffies,
+		   age_ms / 1000, age_ms % 1000);
+	hex_dump_object(seq, object);
+	seq_printf(seq, " backtrace:\n");
+
+	for (frame = 0; frame < object->trace_len; frame++) {
+		void *addr = (void *)object->trace[frame];
+
+		seq_printf(seq, " [<%p>] %pS\n", addr, addr);
+	}
+}
+
+/*
+ * Print the kmemleak_object information to the kernel log. This function is
+ * used mainly for debugging special cases in kmemleak operations. It must be
+ * called with the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+ struct stack_trace trace;
+
+ /* only the fields needed for printing the saved trace are set */
+ trace.nr_entries = object->trace_len;
+ trace.entries = object->trace;
+
+ pr_notice("Object 0x%08lx (size %zu):\n",
+ object->pointer, object->size);
+ pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
+ object->comm, object->pid, object->jiffies);
+ pr_notice(" min_count = %d\n", object->min_count);
+ pr_notice(" count = %d\n", object->count);
+ pr_notice(" flags = 0x%lx\n", object->flags);
+ pr_notice(" checksum = %u\n", object->checksum);
+ pr_notice(" backtrace:\n");
+ print_stack_trace(&trace, 4);
+}
+
+/*
+ * Find the metadata (kmemleak_object) of the memory block containing ptr by
+ * walking the object search tree. When alias is 0 only a pointer to the very
+ * beginning of a block matches; a pointer inside a block then triggers a
+ * warning and NULL is returned. The kmemleak_lock must be held by the caller.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	struct rb_node *node = object_tree_root.rb_node;
+
+	while (node) {
+		struct kmemleak_object *candidate =
+			rb_entry(node, struct kmemleak_object, rb_node);
+
+		if (ptr < candidate->pointer) {
+			node = node->rb_left;
+			continue;
+		}
+		if (candidate->pointer + candidate->size <= ptr) {
+			node = node->rb_right;
+			continue;
+		}
+
+		/* ptr lies within [pointer, pointer + size) of candidate */
+		if (candidate->pointer == ptr || alias)
+			return candidate;
+
+		kmemleak_warn("Found object by alias at 0x%08lx\n", ptr);
+		dump_object_info(candidate);
+		break;
+	}
+	return NULL;
+}
+
+/*
+ * Take a reference on the object by incrementing its use_count, unless the
+ * count has already dropped to 0. In that case the RCU freeing was already
+ * registered and the object must no longer be used. Returns non-zero when
+ * the reference was taken and 0 otherwise. Must be called under
+ * rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	int got_ref = atomic_inc_not_zero(&object->use_count);
+
+	return got_ref;
+}
+
+/*
+ * RCU callback to free a kmemleak_object. Every scan area still linked on
+ * the object's area_list is released first, then the object itself.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+ struct hlist_node *tmp;
+ struct kmemleak_scan_area *area;
+ /* recover the object embedding this rcu_head */
+ struct kmemleak_object *object =
+ container_of(rcu, struct kmemleak_object, rcu);
+
+ /*
+ * Once use_count is 0 (guaranteed by put_object), there is no other
+ * code accessing this object, hence no need for locking.
+ */
+ hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
+ hlist_del(&area->node);
+ kmem_cache_free(scan_area_cache, area);
+ }
+ kmem_cache_free(object_cache, object);
+}
+
+/*
+ * Drop one reference on the object. When the last reference goes away the
+ * object is handed to RCU for freeing. put_object() may run on the
+ * kmemleak_free() -> delete_object() path, and the deferred RCU freeing
+ * ensures there is no recursive call into the kernel allocator. It also
+ * allows lock-less RCU traversal of object_list.
+ */
+static void put_object(struct kmemleak_object *object)
+{
+	if (atomic_dec_and_test(&object->use_count)) {
+		/* should only get here after delete_object was called */
+		WARN_ON(object->flags & OBJECT_ALLOCATED);
+
+		call_rcu(&object->rcu, free_object_rcu);
+	}
+}
+
+/*
+ * Look up an object in the object search tree and increase its use_count.
+ * Returns the object with a reference held, or NULL if no object matches
+ * ptr (see lookup_object() for the meaning of alias) or if the object's
+ * use_count had already reached 0. Callers in this file release the
+ * reference with put_object().
+ */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ unsigned long flags;
+ struct kmemleak_object *object = NULL;
+
+ /* get_object() below must run under rcu_read_lock() */
+ rcu_read_lock();
+ read_lock_irqsave(&kmemleak_lock, flags);
+ /* skip the tree walk for addresses outside the tracked range */
+ if (ptr >= min_addr && ptr < max_addr)
+ object = lookup_object(ptr, alias);
+ read_unlock_irqrestore(&kmemleak_lock, flags);
+
+ /* check whether the object is still available */
+ if (object && !get_object(object))
+ object = NULL;
+ rcu_read_unlock();
+
+ return object;
+}
+
+/*
+ * Capture the current stack trace into the given array, which must have
+ * room for MAX_TRACE entries. Returns the number of entries stored.
+ */
+static int __save_stack_trace(unsigned long *trace)
+{
+	struct stack_trace stack_trace = {
+		.max_entries	= MAX_TRACE,
+		.nr_entries	= 0,
+		.entries	= trace,
+		.skip		= 2,
+	};
+
+	save_stack_trace(&stack_trace);
+
+	return stack_trace.nr_entries;
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root.
+ *
+ * @ptr:	start address of the memory block
+ * @size:	size of the memory block
+ * @min_count:	minimum number of references before the block is considered
+ *		referenced (stored in object->min_count)
+ * @gfp:	allocation flags of the caller, filtered through
+ *		gfp_kmemleak_mask() for the internal metadata allocation
+ *
+ * Returns the newly created object, or NULL if the metadata could not be
+ * allocated or the block overlaps an object already present in the search
+ * tree. kmemleak is disabled in both failure cases.
+ */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+					     int min_count, gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object, *parent;
+	struct rb_node **link, *rb_parent;
+
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+	if (!object) {
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&object->object_list);
+	INIT_LIST_HEAD(&object->gray_list);
+	INIT_HLIST_HEAD(&object->area_list);
+	spin_lock_init(&object->lock);
+	atomic_set(&object->use_count, 1);
+	object->flags = OBJECT_ALLOCATED;
+	object->pointer = ptr;
+	object->size = size;
+	object->min_count = min_count;
+	object->count = 0;			/* white color initially */
+	object->jiffies = jiffies;
+	object->checksum = 0;
+
+	/* task information */
+	if (in_irq()) {
+		object->pid = 0;
+		strncpy(object->comm, "hardirq", sizeof(object->comm));
+	} else if (in_softirq()) {
+		object->pid = 0;
+		strncpy(object->comm, "softirq", sizeof(object->comm));
+	} else {
+		object->pid = current->pid;
+		/*
+		 * There is a small chance of a race with set_task_comm(),
+		 * however using get_task_comm() here may cause locking
+		 * dependency issues with current->alloc_lock. In the worst
+		 * case, the command line is not correct.
+		 */
+		strncpy(object->comm, current->comm, sizeof(object->comm));
+	}
+
+	/* kernel backtrace */
+	object->trace_len = __save_stack_trace(object->trace);
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+
+	/* widen the address range used to filter lookups */
+	min_addr = min(min_addr, ptr);
+	max_addr = max(max_addr, ptr + size);
+
+	/* find the insertion point; blocks are ordered by address */
+	link = &object_tree_root.rb_node;
+	rb_parent = NULL;
+	while (*link) {
+		rb_parent = *link;
+		parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+		if (ptr + size <= parent->pointer)
+			link = &parent->rb_node.rb_left;
+		else if (parent->pointer + parent->size <= ptr)
+			link = &parent->rb_node.rb_right;
+		else {
+			kmemleak_stop("Cannot insert 0x%lx into the object "
+				      "search tree (overlaps existing)\n",
+				      ptr);
+			/* dump_object_info() requires the object lock */
+			spin_lock(&parent->lock);
+			dump_object_info(parent);
+			spin_unlock(&parent->lock);
+			/*
+			 * The new object was never inserted. Do not hand the
+			 * existing overlapping object back to the caller: no
+			 * reference was taken on it, and a caller that writes
+			 * to the returned object (e.g. early_alloc()) would
+			 * modify the metadata of an unrelated block.
+			 */
+			kmem_cache_free(object_cache, object);
+			object = NULL;
+			goto out;
+		}
+	}
+	rb_link_node(&object->rb_node, rb_parent, link);
+	rb_insert_color(&object->rb_node, &object_tree_root);
+
+	list_add_tail_rcu(&object->object_list, &object_list);
+out:
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+	return object;
+}
+
+/*
+ * Remove the metadata (struct kmemleak_object) for a memory block from the
+ * object_list and object_tree_root and decrement its use_count. The caller
+ * is expected to hold its own reference on the object in addition to the
+ * initial one (a use_count below 2 triggers a warning).
+ */
+static void __delete_object(struct kmemleak_object *object)
+{
+ unsigned long flags;
+
+ /* unlink from the search tree and the object list */
+ write_lock_irqsave(&kmemleak_lock, flags);
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+ WARN_ON(atomic_read(&object->use_count) < 2);
+
+ /*
+ * Locking here also ensures that the corresponding memory block
+ * cannot be freed when it is being scanned.
+ */
+ spin_lock_irqsave(&object->lock, flags);
+ object->flags &= ~OBJECT_ALLOCATED;
+ spin_unlock_irqrestore(&object->lock, flags);
+ /* drop the reference set up by create_object() */
+ put_object(object);
+}
+
+/*
+ * Find the metadata (struct kmemleak_object) of the memory block starting at
+ * ptr and delete it. An unknown pointer is silently ignored unless DEBUG is
+ * defined, in which case a warning is printed.
+ */
+static void delete_object_full(unsigned long ptr)
+{
+	struct kmemleak_object *object = find_and_get_object(ptr, 0);
+
+	if (object) {
+		__delete_object(object);
+		/* release the reference taken by the lookup above */
+		put_object(object);
+		return;
+	}
+
+#ifdef DEBUG
+	kmemleak_warn("Freeing unknown object at 0x%08lx\n",
+		      ptr);
+#endif
+}
+
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it. If the memory block is partially freed, the function may create
+ * additional metadata for the remaining parts of the block.
+ *
+ * ptr may point anywhere inside the block (the lookup allows aliases); size
+ * is the length of the range being freed starting at ptr.
+ */
+static void delete_object_part(unsigned long ptr, size_t size)
+{
+ struct kmemleak_object *object;
+ unsigned long start, end;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx "
+ "(size %zu)\n", ptr, size);
+#endif
+ return;
+ }
+ __delete_object(object);
+
+ /*
+ * Create one or two objects that may result from the memory block
+ * split. Note that partial freeing is only done by free_bootmem() and
+ * this happens before kmemleak_init() is called. The path below is
+ * only executed during early log recording in kmemleak_init(), so
+ * GFP_KERNEL is enough.
+ */
+ /* the object is still valid here: the lookup reference is held */
+ start = object->pointer;
+ end = object->pointer + object->size;
+ /* remaining part before the freed range */
+ if (ptr > start)
+ create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL);
+ /* remaining part after the freed range */
+ if (ptr + size < end)
+ create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL);
+
+ put_object(object);
+}
+
+/*
+ * Color the object by storing the color in min_count. Black objects are
+ * additionally flagged so that they are not scanned. paint_it() calls this
+ * with object->lock held.
+ */
+static void __paint_it(struct kmemleak_object *object, int color)
+{
+	if (color == KMEMLEAK_BLACK)
+		object->flags |= OBJECT_NO_SCAN;
+
+	object->min_count = color;
+}
+
+/*
+ * Color the object (see __paint_it()) while holding object->lock with
+ * interrupts disabled.
+ */
+static void paint_it(struct kmemleak_object *object, int color)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&object->lock, flags);
+ __paint_it(object, color);
+ spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Color the object whose memory block starts at ptr. A warning naming the
+ * requested color is printed when no such object is tracked.
+ */
+static void paint_ptr(unsigned long ptr, int color)
+{
+	struct kmemleak_object *object = find_and_get_object(ptr, 0);
+	const char *color_name;
+
+	if (object) {
+		paint_it(object, color);
+		put_object(object);
+		return;
+	}
+
+	if (color == KMEMLEAK_GREY)
+		color_name = "Grey";
+	else if (color == KMEMLEAK_BLACK)
+		color_name = "Black";
+	else
+		color_name = "Unknown";
+
+	kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+		      ptr, color_name);
+}
+
+/*
+ * Mark an object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ * Implemented by painting the object KMEMLEAK_GREY, i.e. setting its
+ * min_count to 0.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_GREY);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting. Implemented by painting the object KMEMLEAK_BLACK, which also
+ * sets the OBJECT_NO_SCAN flag.
+ */
+static void make_black_object(unsigned long ptr)
+{
+ paint_ptr(ptr, KMEMLEAK_BLACK);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ *
+ * ptr may point anywhere inside the object and is the start of the area. A
+ * size of SIZE_MAX means "up to the end of the object". An area reaching
+ * beyond the end of the object is rejected with a warning.
+ */
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ struct kmemleak_scan_area *area;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+ kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
+ ptr);
+ return;
+ }
+
+ /* allocate before taking the object lock */
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+ if (!area) {
+ pr_warning("Cannot allocate a scan area\n");
+ goto out;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ if (size == SIZE_MAX) {
+ /* cover everything from ptr to the end of the object */
+ size = object->pointer + object->size - ptr;
+ } else if (ptr + size > object->pointer + object->size) {
+ kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+ dump_object_info(object);
+ kmem_cache_free(scan_area_cache, area);
+ goto out_unlock;
+ }
+
+ INIT_HLIST_NODE(&area->node);
+ area->start = ptr;
+ area->size = size;
+
+ hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+ spin_unlock_irqrestore(&object->lock, flags);
+out:
+ /* release the reference taken by find_and_get_object() */
+ put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
+ * pointer. Such an object will not be scanned by kmemleak but references to
+ * it are still searched for.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+	struct kmemleak_object *object = find_and_get_object(ptr, 0);
+	unsigned long flags;
+
+	if (object) {
+		spin_lock_irqsave(&object->lock, flags);
+		object->flags |= OBJECT_NO_SCAN;
+		spin_unlock_irqrestore(&object->lock, flags);
+		put_object(object);
+		return;
+	}
+
+	kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
+}
+
+/*
+ * Log an early kmemleak_* call to the early_log buffer. These calls will be
+ * processed later once kmemleak is fully initialized.
+ *
+ * op_type is one of the KMEMLEAK_* early logging operation types; ptr, size
+ * and min_count are the arguments of the original call. The current stack
+ * trace is saved alongside them.
+ */
+static void __init log_early(int op_type, const void *ptr, size_t size,
+ int min_count)
+{
+ unsigned long flags;
+ struct early_log *log;
+
+ if (kmemleak_error) {
+ /* kmemleak stopped recording, just count the requests */
+ crt_early_log++;
+ return;
+ }
+
+ /* the buffer is full: give up and disable kmemleak */
+ if (crt_early_log >= ARRAY_SIZE(early_log)) {
+ kmemleak_disable();
+ return;
+ }
+
+ /*
+ * There is no need for locking since the kernel is still in UP mode
+ * at this stage. Disabling the IRQs is enough.
+ */
+ local_irq_save(flags);
+ log = &early_log[crt_early_log];
+ log->op_type = op_type;
+ log->ptr = ptr;
+ log->size = size;
+ log->min_count = min_count;
+ log->trace_len = __save_stack_trace(log->trace);
+ crt_early_log++;
+ local_irq_restore(flags);
+}
+
+/*
+ * Register an early logged allocation: create the object for the logged
+ * block and replace its stack trace with the one saved when the allocation
+ * was logged. Nothing is done if kmemleak is not enabled or the logged
+ * pointer is NULL or an error value.
+ */
+static void early_alloc(struct early_log *log)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+ int i;
+
+ if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
+ return;
+
+ /*
+ * RCU locking needed to ensure object is not freed via put_object().
+ */
+ rcu_read_lock();
+ object = create_object((unsigned long)log->ptr, log->size,
+ log->min_count, GFP_ATOMIC);
+ if (!object)
+ goto out;
+ /* overwrite the trace captured inside create_object() */
+ spin_lock_irqsave(&object->lock, flags);
+ for (i = 0; i < log->trace_len; i++)
+ object->trace[i] = log->trace[i];
+ object->trace_len = log->trace_len;
+ spin_unlock_irqrestore(&object->lock, flags);
+out:
+ rcu_read_unlock();
+}
+
+/*
+ * Register an early logged per-CPU allocation: for every possible CPU,
+ * resolve that CPU's copy of the block and register it via early_alloc().
+ */
+static void early_alloc_percpu(struct early_log *log)
+{
+	const void __percpu *base = log->ptr;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		/* early_alloc() takes the address from log->ptr */
+		log->ptr = per_cpu_ptr(base, cpu);
+		early_alloc(log);
+	}
+}
+
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
+ */
+void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
+			  gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+	/* kmemleak is running and the pointer is valid: track it now */
+	if (kmemleak_enabled && !IS_ERR_OR_NULL(ptr)) {
+		create_object((unsigned long)ptr, size, min_count, gfp);
+		return;
+	}
+
+	/* otherwise record the call in the early log, if still active */
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/**
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ * @size: size of the object
+ *
+ * This function is called from the kernel percpu allocator when a new object
+ * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
+ * allocation.
+ */
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+{
+	unsigned int cpu;
+
+	pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr)) {
+		/*
+		 * One object per possible CPU. Percpu allocations are only
+		 * scanned and not reported as leaks (min_count is 0).
+		 */
+		for_each_possible_cpu(cpu) {
+			create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+				      size, 0, GFP_KERNEL);
+		}
+		return;
+	}
+
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
+
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void __ref kmemleak_free(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/* kmemleak is running and the pointer is valid: untrack it now */
+	if (kmemleak_enabled && !IS_ERR_OR_NULL(ptr)) {
+		delete_object_full((unsigned long)ptr);
+		return;
+	}
+
+	/* otherwise record the call in the early log, if still active */
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_FREE, ptr, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
+ */
+void __ref kmemleak_free_part(const void *ptr, size_t size)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/* kmemleak is running and the pointer is valid: split the object */
+	if (kmemleak_enabled && !IS_ERR_OR_NULL(ptr)) {
+		delete_object_part((unsigned long)ptr, size);
+		return;
+	}
+
+	/* otherwise record the call in the early log, if still active */
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_part);
+
+/**
+ * kmemleak_free_percpu - unregister a previously registered __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ *
+ * This function is called from the kernel percpu allocator when an object
+ * (memory block) is freed (free_percpu).
+ */
+void __ref kmemleak_free_percpu(const void __percpu *ptr)
+{
+	unsigned int cpu;
+
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr)) {
+		/* remove the object registered for each possible CPU */
+		for_each_possible_cpu(cpu) {
+			unsigned long addr =
+				(unsigned long)per_cpu_ptr(ptr, cpu);
+
+			delete_object_full(addr);
+		}
+		return;
+	}
+
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
+
+/**
+ * kmemleak_update_trace - update object allocation stack trace
+ * @ptr: pointer to beginning of the object
+ *
+ * Override the object allocation stack trace for cases where the actual
+ * allocation place is not always useful.
+ */
+void __ref kmemleak_update_trace(const void *ptr)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ /* unlike the other callbacks, nothing is logged early here */
+ if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
+ return;
+
+ /* ptr may point inside the object (alias lookup) */
+ object = find_and_get_object((unsigned long)ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Updating stack trace for unknown object at %p\n",
+ ptr);
+#endif
+ return;
+ }
+
+ /* replace the saved trace with the current call stack */
+ spin_lock_irqsave(&object->lock, flags);
+ object->trace_len = __save_stack_trace(object->trace);
+ spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+}
+EXPORT_SYMBOL(kmemleak_update_trace);
+
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr:	pointer to beginning of the object
+ *
+ * After this call the memory block is no longer reported as a leak but it
+ * keeps being scanned. If kmemleak is not enabled, or @ptr is NULL or an
+ * error pointer, the request goes to the early log while early logging is
+ * still active.
+ */
+void __ref kmemleak_not_leak(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr)) {
+		make_gray_object((unsigned long)ptr);
+		return;
+	}
+
+	if (kmemleak_early_log)
+		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr:	pointer to beginning of the object
+ *
+ * Make kmemleak ignore the memory block: it is neither scanned nor reported
+ * as a leak. Use this when the block is known not to be a leak and not to
+ * hold references to other allocated memory blocks. If kmemleak is not
+ * enabled, or @ptr is NULL or an error pointer, the request goes to the early
+ * log while early logging is still active.
+ */
+void __ref kmemleak_ignore(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (!kmemleak_enabled || !ptr || IS_ERR(ptr)) {
+		if (kmemleak_early_log)
+			log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
+		return;
+	}
+
+	make_black_object((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr:	pointer to beginning or inside the object. This also
+ *		represents the start of the scan area
+ * @size:	size of the scan area
+ * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * Use this when only certain parts of an object are known to contain
+ * references to other objects. Kmemleak then scans only these areas, which
+ * reduces the number of false negatives. A zero @size is never registered
+ * directly; like a NULL or error @ptr, it falls through to the early log
+ * while early logging is still active.
+ */
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) {
+		add_scan_area((unsigned long)ptr, size, gfp);
+	} else {
+		if (kmemleak_early_log)
+			log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
+	}
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr:	pointer to beginning of the object
+ *
+ * Tell kmemleak not to scan the given memory block. This is useful when the
+ * object is known not to contain any references to other objects; skipping
+ * such objects reduces the number of false negatives. If kmemleak is not
+ * enabled, or @ptr is NULL or an error pointer, the request goes to the early
+ * log while early logging is still active.
+ */
+void __ref kmemleak_no_scan(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (!(kmemleak_enabled && ptr && !IS_ERR(ptr))) {
+		if (kmemleak_early_log)
+			log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
+		return;
+	}
+
+	object_no_scan((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/*
+ * Recompute the CRC32 of an object's memory block and store it in the
+ * object. Returns true when the stored checksum changed. Blocks that
+ * kmemcheck does not consider initialized are left alone and reported as
+ * unmodified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+	const u32 prev_csum = object->checksum;
+	bool initialized;
+
+	initialized = kmemcheck_is_obj_initialized(object->pointer,
+						   object->size);
+	if (!initialized)
+		return false;
+
+	object->checksum = crc32(0, (void *)object->pointer, object->size);
+
+	return prev_csum != object->checksum;
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptible. This
+ * function checks whether such an interrupt condition occurred.
+ *
+ * Returns non-zero when the scan should stop: kmemleak has been disabled,
+ * the scanning process has a signal pending, or the scanning kthread was
+ * asked to stop.
+ */
+static int scan_should_stop(void)
+{
+	if (!kmemleak_enabled)
+		return 1;
+
+	/*
+	 * This function may be called from either process or kthread context,
+	 * hence the need to check for both stop conditions. A task with an mm
+	 * is treated as a process, anything else as a kthread.
+	 */
+	if (current->mm)
+		return signal_pending(current);
+
+	return kthread_should_stop();
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ *
+ * @_start is aligned with PTR_ALIGN() and every pointer-sized word that fits
+ * entirely before @_end is looked up with find_and_get_object(). @scanned is
+ * the object owning the block; words resolving back to it are skipped
+ * (callers scanning non-object memory pass NULL). A non-zero @allow_resched
+ * calls cond_resched() before each word. The loop ends early as soon as
+ * scan_should_stop() returns non-zero.
+ */
+static void scan_block(void *_start, void *_end,
+ struct kmemleak_object *scanned, int allow_resched)
+{
+ unsigned long *ptr;
+ unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+ unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+
+ for (ptr = start; ptr < end; ptr++) {
+ struct kmemleak_object *object;
+ unsigned long flags;
+ unsigned long pointer;
+
+ if (allow_resched)
+ cond_resched();
+ if (scan_should_stop())
+ break;
+
+ /* don't scan uninitialized memory */
+ if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
+ BYTES_PER_POINTER))
+ continue;
+
+ pointer = *ptr;
+
+ object = find_and_get_object(pointer, 1);
+ if (!object)
+ continue;
+ if (object == scanned) {
+ /* self referenced, ignore */
+ put_object(object);
+ continue;
+ }
+
+ /*
+ * Avoid the lockdep recursive warning on object->lock being
+ * previously acquired in scan_object(). These locks are
+ * enclosed by scan_mutex.
+ */
+ spin_lock_irqsave_nested(&object->lock, flags,
+ SINGLE_DEPTH_NESTING);
+ if (!color_white(object)) {
+ /* non-orphan, ignored or new */
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+ continue;
+ }
+
+ /*
+ * Increase the object's reference count (number of pointers
+ * to the memory block). If this count reaches the required
+ * minimum, the object's color will become gray and it will be
+ * added to the gray_list.
+ */
+ object->count++;
+ if (color_gray(object)) {
+ list_add_tail(&object->gray_list, &gray_list);
+ spin_unlock_irqrestore(&object->lock, flags);
+ continue; /* reference kept while queued; scan_gray_list() drops it */
+ }
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ put_object(object);
+ }
+}
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ *
+ * Objects flagged OBJECT_NO_SCAN or no longer flagged OBJECT_ALLOCATED are
+ * skipped. Without registered scan areas the whole block is scanned in
+ * MAX_SCAN_SIZE chunks; object->lock is dropped around cond_resched() between
+ * chunks and both flags are re-checked once it is held again. With scan
+ * areas, only those areas are scanned and the lock is held throughout.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+ struct kmemleak_scan_area *area;
+ unsigned long flags;
+
+ /*
+ * Once the object->lock is acquired, the corresponding memory block
+ * cannot be freed (the same lock is acquired in delete_object).
+ */
+ spin_lock_irqsave(&object->lock, flags);
+ if (object->flags & OBJECT_NO_SCAN)
+ goto out;
+ if (!(object->flags & OBJECT_ALLOCATED))
+ /* already freed object */
+ goto out;
+ if (hlist_empty(&object->area_list)) {
+ void *start = (void *)object->pointer;
+ void *end = (void *)(object->pointer + object->size);
+
+ while (start < end && (object->flags & OBJECT_ALLOCATED) &&
+ !(object->flags & OBJECT_NO_SCAN)) {
+ scan_block(start, min(start + MAX_SCAN_SIZE, end),
+ object, 0);
+ start += MAX_SCAN_SIZE;
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&object->lock, flags);
+ }
+ } else
+ hlist_for_each_entry(area, &object->area_list, node)
+ scan_block((void *)area->start,
+ (void *)(area->start + area->size),
+ object, 0);
+out:
+ spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ *
+ * Every entry is removed from gray_list and has its reference dropped, even
+ * when scan_should_stop() makes the loop skip the actual scan_object() call,
+ * so the list is expected to be empty on return (checked by the WARN_ON).
+ */
+static void scan_gray_list(void)
+{
+ struct kmemleak_object *object, *tmp;
+
+ /*
+ * The list traversal is safe for both tail additions and removals
+ * from inside the loop. The kmemleak objects cannot be freed from
+ * outside the loop because their use_count was incremented.
+ */
+ object = list_entry(gray_list.next, typeof(*object), gray_list);
+ while (&object->gray_list != &gray_list) {
+ cond_resched();
+
+ /* may add new objects to the list */
+ if (!scan_should_stop())
+ scan_object(object);
+
+ tmp = list_entry(object->gray_list.next, typeof(*object),
+ gray_list); /* fetched after the scan so that tail additions are seen */
+
+ /* remove the object from the list and release it */
+ list_del(&object->gray_list);
+ put_object(object);
+
+ object = tmp;
+ }
+ WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ *
+ * The steps are: reset every object's reference count and queue the objects
+ * that are gray regardless; scan the data and bss sections, the per-cpu
+ * sections (CONFIG_SMP only), the struct page of every in-use page on the
+ * online nodes and, if kmemleak_stack_scan is set, the task stacks; scan the
+ * gray list; temporarily gray the white, still allocated objects whose
+ * checksum changed and scan the gray list again; finally flag and count the
+ * newly unreferenced objects, unless the scan was asked to stop.
+ */
+static void kmemleak_scan(void)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+ int i;
+ int new_leaks = 0;
+
+ jiffies_last_scan = jiffies;
+
+ /* prepare the kmemleak_object's */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+ /*
+ * With a few exceptions there should be a maximum of
+ * 1 reference to any object at this point.
+ */
+ if (atomic_read(&object->use_count) > 1) {
+ pr_debug("object->use_count = %d\n",
+ atomic_read(&object->use_count));
+ dump_object_info(object);
+ }
+#endif
+ /* reset the reference count (whiten the object) */
+ object->count = 0;
+ if (color_gray(object) && get_object(object))
+ list_add_tail(&object->gray_list, &gray_list);
+
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ /* data/bss scanning */
+ scan_block(_sdata, _edata, NULL, 1);
+ scan_block(__bss_start, __bss_stop, NULL, 1);
+
+#ifdef CONFIG_SMP
+ /* per-cpu sections scanning */
+ for_each_possible_cpu(i)
+ scan_block(__per_cpu_start + per_cpu_offset(i),
+ __per_cpu_end + per_cpu_offset(i), NULL, 1);
+#endif
+
+ /*
+ * Struct page scanning for each node.
+ */
+ get_online_mems();
+ for_each_online_node(i) {
+ unsigned long start_pfn = node_start_pfn(i);
+ unsigned long end_pfn = node_end_pfn(i);
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ /* only scan if page is in use */
+ if (page_count(page) == 0)
+ continue;
+ scan_block(page, page + 1, NULL, 1); /* the struct page itself, not the page data */
+ }
+ }
+ put_online_mems();
+
+ /*
+ * Scanning the task stacks (may introduce false negatives).
+ */
+ if (kmemleak_stack_scan) {
+ struct task_struct *p, *g;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ scan_block(task_stack_page(p), task_stack_page(p) +
+ THREAD_SIZE, NULL, 0); /* no resched: tasklist_lock is held */
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ }
+
+ /*
+ * Scan the objects already referenced from the sections scanned
+ * above.
+ */
+ scan_gray_list();
+
+ /*
+ * Check for new or unreferenced objects modified since the previous
+ * scan and color them gray until the next scan.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+ if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+ && update_checksum(object) && get_object(object)) {
+ /* color it gray temporarily */
+ object->count = object->min_count;
+ list_add_tail(&object->gray_list, &gray_list);
+ }
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ /*
+ * Re-scan the gray list for modified unreferenced objects.
+ */
+ scan_gray_list();
+
+ /*
+ * If scanning was stopped do not report any new unreferenced objects.
+ */
+ if (scan_should_stop())
+ return;
+
+ /*
+ * Scanning result reporting.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list) {
+ spin_lock_irqsave(&object->lock, flags);
+ if (unreferenced_object(object) &&
+ !(object->flags & OBJECT_REPORTED)) {
+ object->flags |= OBJECT_REPORTED; /* counted as new only once */
+ new_leaks++;
+ }
+ spin_unlock_irqrestore(&object->lock, flags);
+ }
+ rcu_read_unlock();
+
+ if (new_leaks) {
+ kmemleak_found_leaks = true;
+
+ pr_info("%d new suspected memory leaks (see "
+ "/sys/kernel/debug/kmemleak)\n", new_leaks);
+ }
+
+}
+
+/*
+ * Body of the automatic memory scanning thread. Each iteration runs one
+ * scan under scan_mutex and then sleeps for jiffies_scan_wait, until the
+ * thread is asked to stop. Only the very first start of the thread is
+ * preceded by the SECS_FIRST_SCAN delay. Unreferenced objects found by a
+ * scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+	static int first_run = 1;
+
+	pr_info("Automatic memory scanning thread started\n");
+	set_user_nice(current, 10);
+
+	/*
+	 * Wait before the first scan to allow the system to fully initialize.
+	 */
+	if (first_run) {
+		first_run = 0;
+		ssleep(SECS_FIRST_SCAN);
+	}
+
+	for (;;) {
+		signed long remaining;
+
+		if (kthread_should_stop())
+			break;
+
+		/* sample the period before the (possibly long) scan */
+		remaining = jiffies_scan_wait;
+
+		mutex_lock(&scan_mutex);
+		kmemleak_scan();
+		mutex_unlock(&scan_mutex);
+
+		/* wait before the next scan */
+		while (remaining && !kthread_should_stop())
+			remaining = schedule_timeout_interruptible(remaining);
+	}
+
+	pr_info("Automatic memory scanning thread ended\n");
+
+	return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread unless one is already
+ * recorded in scan_thread. On failure a warning is printed and scan_thread
+ * stays NULL. This function must be called with the scan_mutex held.
+ */
+static void start_scan_thread(void)
+{
+	if (!scan_thread) {
+		scan_thread = kthread_run(kmemleak_scan_thread, NULL,
+					  "kmemleak");
+		if (IS_ERR(scan_thread)) {
+			pr_warning("Failed to create the scan thread\n");
+			scan_thread = NULL;
+		}
+	}
+}
+
+/*
+ * Stop the automatic memory scanning thread, if there is one, and forget
+ * it. This function must be called with the scan_mutex held.
+ */
+static void stop_scan_thread(void)
+{
+	if (!scan_thread)
+		return;
+
+	kthread_stop(scan_thread);
+	scan_thread = NULL;
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented, or NULL if there is
+ * none.
+ *
+ * scan_mutex is taken interruptibly; if that fails, an ERR_PTR is returned
+ * and nothing is held. Otherwise both scan_mutex and the RCU read lock are
+ * still held on return and are released by kmemleak_seq_stop().
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct kmemleak_object *obj;
+	loff_t to_skip = *pos;
+	int err;
+
+	err = mutex_lock_interruptible(&scan_mutex);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(obj, &object_list, object_list) {
+		/* step over the entries before the requested position */
+		if (to_skip > 0) {
+			to_skip--;
+			continue;
+		}
+		/* objects whose reference cannot be taken are passed over */
+		if (get_object(obj))
+			return obj;
+	}
+
+	return NULL;
+}
+
+/*
+ * Advance the position and return the next object in the object_list on
+ * which a reference could be taken, or NULL at the end of the list. The
+ * reference held on the previous object is dropped.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmemleak_object *prev = v;
+	struct kmemleak_object *cursor = prev;
+	struct kmemleak_object *found = NULL;
+
+	*pos += 1;
+
+	list_for_each_entry_continue_rcu(cursor, &object_list, object_list) {
+		if (!get_object(cursor))
+			continue;
+		found = cursor;
+		break;
+	}
+
+	/* only let go of the previous object once the walk is done */
+	put_object(prev);
+	return found;
+}
+
+/*
+ * End of a seq_file iteration: release the RCU read lock and scan_mutex
+ * taken by kmemleak_seq_start() and decrement the use_count of the last
+ * object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+	/*
+	 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+	 * waiting was interrupted; in that case nothing is held.
+	 */
+	if (IS_ERR(v))
+		return;
+
+	rcu_read_unlock();
+	mutex_unlock(&scan_mutex);
+	if (v)
+		put_object(v);
+}
+
+/*
+ * Print the information for an object to the seq file, but only if it has
+ * been reported and is still unreferenced. Always returns 0.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+	struct kmemleak_object *obj = v;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&obj->lock, irq_flags);
+	if (obj->flags & OBJECT_REPORTED) {
+		if (unreferenced_object(obj))
+			print_unreferenced(seq, obj);
+	}
+	spin_unlock_irqrestore(&obj->lock, irq_flags);
+
+	return 0;
+}
+
+static const struct seq_operations kmemleak_seq_ops = { /* iterator over object_list, used by kmemleak_open() */
+ .start = kmemleak_seq_start, /* takes scan_mutex and the RCU read lock */
+ .next = kmemleak_seq_next,
+ .stop = kmemleak_seq_stop, /* releases what .start took */
+ .show = kmemleak_seq_show, /* prints reported, still unreferenced objects */
+};
+
+/* Open handler: attach the kmemleak seq_file iterator to the file. */
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &kmemleak_seq_ops);
+
+	return err;
+}
+
+/*
+ * Parse @str as an address (kstrtoul() with base 0) and dump the kmemleak
+ * object found for it. Returns 0 on success or -EINVAL when the string does
+ * not parse or no object is found at that address.
+ */
+static int dump_str_object_info(const char *str)
+{
+	struct kmemleak_object *obj;
+	unsigned long irq_flags;
+	unsigned long addr;
+
+	if (kstrtoul(str, 0, &addr) != 0)
+		return -EINVAL;
+
+	obj = find_and_get_object(addr, 0);
+	if (obj) {
+		spin_lock_irqsave(&obj->lock, irq_flags);
+		dump_object_info(obj);
+		spin_unlock_irqrestore(&obj->lock, irq_flags);
+
+		put_object(obj);
+		return 0;
+	}
+
+	pr_info("Unknown object at 0x%08lx\n", addr);
+	return -EINVAL;
+}
+
+/*
+ * Paint every reported, still unreferenced object grey and reset
+ * kmemleak_found_leaks.
+ *
+ * We use grey instead of black to ensure we can do future scans on the same
+ * objects. If we did not do future scans these black objects could
+ * potentially contain references to newly allocated objects in the future and
+ * we'd end up with false positives.
+ */
+static void kmemleak_clear(void)
+{
+	struct kmemleak_object *obj;
+	unsigned long irq_flags;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(obj, &object_list, object_list) {
+		spin_lock_irqsave(&obj->lock, irq_flags);
+		if (obj->flags & OBJECT_REPORTED) {
+			if (unreferenced_object(obj))
+				__paint_it(obj, KMEMLEAK_GREY);
+		}
+		spin_unlock_irqrestore(&obj->lock, irq_flags);
+	}
+	rcu_read_unlock();
+
+	kmemleak_found_leaks = false;
+}
+
+static void __kmemleak_do_cleanup(void);
+
+/*
+ * File write operation to configure kmemleak at run-time. The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ *   off	- disable kmemleak (irreversible)
+ *   stack=on	- enable the task stacks scanning
+ *   stack=off	- disable the tasks stacks scanning
+ *   scan=on	- start the automatic memory scanning thread
+ *   scan=off	- stop the automatic memory scanning thread
+ *   scan=...	- set the automatic memory scanning period in seconds (0 to
+ *		  disable it); a period too large to be expressed in
+ *		  milliseconds as an unsigned int is clamped to the
+ *		  largest one that can
+ *   scan	- trigger a memory scan
+ *   clear	- mark all current reported unreferenced kmemleak objects as
+ *		  grey to ignore printing them, or free all kmemleak objects
+ *		  if kmemleak has been disabled.
+ *   dump=...	- dump information about the object found at the given address
+ *
+ * Only one command per write is interpreted and at most sizeof(buf) - 1
+ * bytes of the user buffer are examined. Returns @size on success, -EFAULT
+ * if the user buffer cannot be read, the mutex_lock_interruptible() error if
+ * waiting for scan_mutex was interrupted, -EBUSY for any command other than
+ * "clear" once kmemleak is disabled, and -EINVAL (or the number parser's
+ * error) for an unknown or malformed command.
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+			      size_t size, loff_t *ppos)
+{
+	char buf[64];
+	int buf_size;
+	int ret;
+
+	buf_size = min(size, (sizeof(buf) - 1));
+	if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+		return -EFAULT;
+	buf[buf_size] = 0;
+
+	ret = mutex_lock_interruptible(&scan_mutex);
+	if (ret < 0)
+		return ret;
+
+	/* "clear" is the only command still accepted after a disable */
+	if (strncmp(buf, "clear", 5) == 0) {
+		if (kmemleak_enabled)
+			kmemleak_clear();
+		else
+			__kmemleak_do_cleanup();
+		goto out;
+	}
+
+	if (!kmemleak_enabled) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/* the longer "scan=..." forms must be tested before plain "scan" */
+	if (strncmp(buf, "off", 3) == 0) {
+		kmemleak_disable();
+	} else if (strncmp(buf, "stack=on", 8) == 0) {
+		kmemleak_stack_scan = 1;
+	} else if (strncmp(buf, "stack=off", 9) == 0) {
+		kmemleak_stack_scan = 0;
+	} else if (strncmp(buf, "scan=on", 7) == 0) {
+		start_scan_thread();
+	} else if (strncmp(buf, "scan=off", 8) == 0) {
+		stop_scan_thread();
+	} else if (strncmp(buf, "scan=", 5) == 0) {
+		/* largest period whose millisecond value fits an unsigned int */
+		const unsigned long max_secs = ~0U / 1000;
+		unsigned long secs;
+
+		ret = kstrtoul(buf + 5, 0, &secs);
+		if (ret < 0)
+			goto out;
+		/*
+		 * msecs_to_jiffies() takes an unsigned int, so clamp instead
+		 * of letting secs * 1000 wrap into an arbitrary short period.
+		 */
+		if (secs > max_secs)
+			secs = max_secs;
+		stop_scan_thread();
+		if (secs) {
+			jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+			start_scan_thread();
+		}
+	} else if (strncmp(buf, "scan", 4) == 0) {
+		kmemleak_scan();
+	} else if (strncmp(buf, "dump=", 5) == 0) {
+		ret = dump_str_object_info(buf + 5);
+	} else {
+		ret = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&scan_mutex);
+	if (ret < 0)
+		return ret;
+
+	/* ignore the rest of the buffer, only one command at a time */
+	*ppos += size;
+	return size;
+}
+
+static const struct file_operations kmemleak_fops = { /* handed to debugfs_create_file() in kmemleak_late_init() */
+ .owner = THIS_MODULE,
+ .open = kmemleak_open, /* sets up the seq_file with kmemleak_seq_ops */
+ .read = seq_read,
+ .write = kmemleak_write, /* run-time commands (off, scan, clear, ...) */
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Unregister every object on object_list via delete_object_full(). Both
+ * callers in this file hold scan_mutex around the call.
+ */
+static void __kmemleak_do_cleanup(void)
+{
+	struct kmemleak_object *obj;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(obj, &object_list, object_list) {
+		delete_object_full(obj->pointer);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Work item run after kmemleak has been disabled: stop the memory scanning
+ * thread and free the kmemleak internal objects, unless leaks have been
+ * found (kmemleak may then still hold useful information on them; the memory
+ * can be reclaimed later by writing "clear" to the debugfs file).
+ */
+static void kmemleak_do_cleanup(struct work_struct *work)
+{
+	mutex_lock(&scan_mutex);
+	stop_scan_thread();
+
+	if (kmemleak_found_leaks)
+		pr_info("Kmemleak disabled without freeing internal data. "
+			"Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+	else
+		__kmemleak_do_cleanup();
+	mutex_unlock(&scan_mutex);
+}
+
+static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation. Only
+ * the first call has any effect; later calls return immediately.
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically claim the error flag; a non-zero old value means done */
+	if (cmpxchg(&kmemleak_error, 0, 1) != 0)
+		return;
+
+	/* stop any memory operation tracing */
+	kmemleak_enabled = 0;
+
+	/* before late init it is too early for a kernel thread */
+	if (kmemleak_initialized)
+		schedule_work(&cleanup_work);
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Handler for the "kmemleak=" early boot parameter. "off" disables
+ * kmemleak, "on" sets kmemleak_skip_disable, which kmemleak_init() checks
+ * under CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF. Returns 0 when the value was
+ * recognised and -EINVAL otherwise, including for a missing value.
+ */
+static int kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (strcmp(str, "off") == 0) {
+		kmemleak_disable();
+		return 0;
+	}
+
+	if (strcmp(str, "on") == 0) {
+		kmemleak_skip_disable = 1;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Print the stack trace saved with an early log entry. Only the entries and
+ * nr_entries members of the temporary stack_trace are filled in.
+ */
+static void __init print_log_trace(struct early_log *log)
+{
+	struct stack_trace saved;
+
+	saved.entries = log->trace;
+	saved.nr_entries = log->trace_len;
+
+	pr_notice("Early log backtrace:\n");
+	print_stack_trace(&saved, 2);
+}
+
+/*
+ * Kmemleak initialization.
+ *
+ * With CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF, kmemleak is disabled here unless
+ * kmemleak_skip_disable was set ("kmemleak=on" boot parameter). Otherwise the
+ * scan timing values and the two slab caches are set up, early logging is
+ * switched off, kmemleak is enabled (unless kmemleak_error is already set, in
+ * which case the function returns) and the operations stored in early_log[]
+ * are replayed in order.
+ */
+void __init kmemleak_init(void)
+{
+ int i;
+ unsigned long flags;
+
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_early_log = 0;
+ kmemleak_disable();
+ return;
+ }
+#endif
+
+ jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+ jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+ object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+ scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+
+ if (crt_early_log >= ARRAY_SIZE(early_log))
+ pr_warning("Early log buffer exceeded (%d), please increase "
+ "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+
+ /* the kernel is still in UP mode, so disabling the IRQs is enough */
+ local_irq_save(flags);
+ kmemleak_early_log = 0;
+ if (kmemleak_error) {
+ local_irq_restore(flags);
+ return;
+ } else
+ kmemleak_enabled = 1;
+ local_irq_restore(flags);
+
+ /*
+ * This is the point where tracking allocations is safe. Automatic
+ * scanning is started during the late initcall. Add the early logged
+ * callbacks to the kmemleak infrastructure.
+ */
+ for (i = 0; i < crt_early_log; i++) {
+ struct early_log *log = &early_log[i];
+
+ switch (log->op_type) {
+ case KMEMLEAK_ALLOC:
+ early_alloc(log);
+ break;
+ case KMEMLEAK_ALLOC_PERCPU:
+ early_alloc_percpu(log);
+ break;
+ case KMEMLEAK_FREE:
+ kmemleak_free(log->ptr);
+ break;
+ case KMEMLEAK_FREE_PART:
+ kmemleak_free_part(log->ptr, log->size);
+ break;
+ case KMEMLEAK_FREE_PERCPU:
+ kmemleak_free_percpu(log->ptr);
+ break;
+ case KMEMLEAK_NOT_LEAK:
+ kmemleak_not_leak(log->ptr);
+ break;
+ case KMEMLEAK_IGNORE:
+ kmemleak_ignore(log->ptr);
+ break;
+ case KMEMLEAK_SCAN_AREA:
+ kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
+ break;
+ case KMEMLEAK_NO_SCAN:
+ kmemleak_no_scan(log->ptr);
+ break;
+ default:
+ kmemleak_warn("Unknown early log operation: %d\n",
+ log->op_type);
+ }
+
+ if (kmemleak_warning) { /* the replayed operation warned: show where it was logged */
+ print_log_trace(log);
+ kmemleak_warning = 0;
+ }
+ }
+}
+
+/*
+ * Late initialization function: marks kmemleak as initialized, creates the
+ * debugfs "kmemleak" file and starts the automatic scanning thread. If
+ * kmemleak was disabled by an earlier error, only the clean-up work is
+ * scheduled and -ENOMEM is returned. A failure to create the debugfs file
+ * is merely warned about.
+ */
+static int __init kmemleak_late_init(void)
+{
+	kmemleak_initialized = 1;
+
+	if (kmemleak_error) {
+		/*
+		 * Some error occurred and kmemleak was disabled. There is a
+		 * small chance that kmemleak_disable() was called immediately
+		 * after setting kmemleak_initialized and we may end up with
+		 * two clean-up threads but serialized by scan_mutex.
+		 */
+		schedule_work(&cleanup_work);
+		return -ENOMEM;
+	}
+
+	if (!debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+				 &kmemleak_fops))
+		pr_warning("Failed to create the debugfs kmemleak file\n");
+
+	mutex_lock(&scan_mutex);
+	start_scan_thread();
+	mutex_unlock(&scan_mutex);
+
+	pr_info("Kernel memory leak detector initialized\n");
+
+	return 0;
+}
+late_initcall(kmemleak_late_init);
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 00000000000..346ddc9e4c0
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,2347 @@
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ * Hugh Dickins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/memory.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/ksm.h>
+#include <linux/hashtable.h>
+#include <linux/freezer.h>
+#include <linux/oom.h>
+#include <linux/numa.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+#ifdef CONFIG_NUMA /* helpers that compile NUMA-only expressions/statements away on !NUMA builds */
+#define NUMA(x) (x) /* evaluates to x */
+#define DO_NUMA(x) do { (x); } while (0) /* executes x as a statement */
+#else
+#define NUMA(x) (0) /* x is discarded, result is always 0 */
+#define DO_NUMA(x) do { } while (0) /* x is discarded, nothing is executed */
+#endif
+
+/*
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents. Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time". The unstable tree sorts these pages
+ * by their contents, but since they are not write-protected, KSM cannot rely
+ * upon the unstable tree to work correctly - the unstable tree is liable to
+ * be corrupted as its contents are modified, and so it is called unstable.
+ *
+ * KSM solves this problem by several techniques:
+ *
+ * 1) The unstable tree is flushed every time KSM completes scanning all
+ * memory areas, and then the tree is rebuilt again from the beginning.
+ * 2) KSM will only insert into the unstable tree, pages whose hash value
+ * has not changed since the previous scan of all memory areas.
+ * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
+ * colors of the nodes and not on their contents, assuring that even when
+ * the tree gets "corrupted" it won't get out of balance, so scanning time
+ * remains the same (also, searching and inserting nodes in an rbtree uses
+ * the same algorithm, so we have no overhead when we flush and rebuild).
+ * 4) KSM never flushes the stable tree, which means that even if it were to
+ * take 10 attempts to find a page in the unstable tree, once it is found,
+ * it is secured in the stable tree. (When we scan a new page, we first
+ * compare it against the stable tree, and then against the unstable tree.)
+ *
+ * If the merge_across_nodes tunable is unset, then KSM maintains multiple
+ * stable trees and multiple unstable trees: one of each for each NUMA node.
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
+ * @mm: the mm that this information is valid for
+ *
+ * Slots come zero-filled from mm_slot_cache, see alloc_mm_slot().
+ */
+struct mm_slot {
+ struct hlist_node link;
+ struct list_head mm_list;
+ struct rmap_item *rmap_list;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
+ * @seqnr: count of completed full scans (needed when removing unstable node)
+ *
+ * There is only the one ksm_scan instance of this cursor structure. It is
+ * statically initialized with @mm_slot pointing at ksm_mm_head and all other
+ * members zero.
+ */
+struct ksm_scan {
+ struct mm_slot *mm_slot;
+ unsigned long address;
+ struct rmap_item **rmap_list;
+ unsigned long seqnr;
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
+ *
+ * @node shares its storage with the @head/@list pair, so only one of the two
+ * views is meaningful at a time. @nid exists only when CONFIG_NUMA is set.
+ * Nodes come from stable_node_cache via alloc_stable_node(), which uses
+ * kmem_cache_alloc() and therefore does not zero them.
+ */
+struct stable_node {
+ union {
+ struct rb_node node; /* when node of stable tree */
+ struct { /* when listed for migration */
+ struct list_head *head;
+ struct list_head list;
+ };
+ };
+ struct hlist_head hlist;
+ unsigned long kpfn;
+#ifdef CONFIG_NUMA
+ int nid;
+#endif
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which linked (may not match page)
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @oldchecksum: previous checksum of the page at that virtual address
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ *
+ * @anon_vma and @nid share storage (@nid exists only with CONFIG_NUMA), as
+ * do @node and the @head/@hlist pair. The flag bits kept in the low bits of
+ * @address are SEQNR_MASK, UNSTABLE_FLAG and STABLE_FLAG. Items come
+ * zero-filled from rmap_item_cache, see alloc_rmap_item().
+ */
+struct rmap_item {
+ struct rmap_item *rmap_list;
+ union {
+ struct anon_vma *anon_vma; /* when stable */
+#ifdef CONFIG_NUMA
+ int nid; /* when node of unstable tree */
+#endif
+ };
+ struct mm_struct *mm;
+ unsigned long address; /* + low bits used for flags below */
+ unsigned int oldchecksum; /* when unstable */
+ union {
+ struct rb_node node; /* when node of unstable tree */
+ struct { /* when listed from stable tree */
+ struct stable_node *head;
+ struct hlist_node hlist;
+ };
+ };
+};
+
+#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
+#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
+#define STABLE_FLAG 0x200 /* is listed from the stable tree */
+
+/* The stable and unstable tree heads */
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree; /* initially the single static tree above */
+static struct rb_root *root_unstable_tree = one_unstable_tree; /* likewise */
+
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct mm_slot ksm_mm_head = {
+ .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+};
+static struct ksm_scan ksm_scan = {
+ .mm_slot = &ksm_mm_head,
+};
+
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *mm_slot_cache; /* NULL when slab setup failed or was torn down, see alloc_mm_slot() */
+
+/* The number of nodes in the stable tree */
+static unsigned long ksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long ksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long ksm_pages_unshared;
+
+/* The number of rmap_items in use: to calculate pages_volatile */
+static unsigned long ksm_rmap_items;
+
+/* Number of pages ksmd should scan in one batch */
+static unsigned int ksm_thread_pages_to_scan = 100;
+
+/* Milliseconds ksmd should sleep between batches */
+static unsigned int ksm_thread_sleep_millisecs = 20;
+
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else /* !CONFIG_NUMA: both become compile-time constants */
+#define ksm_merge_across_nodes 1U
+#define ksm_nr_node_ids 1
+#endif
+
+#define KSM_RUN_STOP 0 /* initial value of ksm_run */
+#define KSM_RUN_MERGE 1
+#define KSM_RUN_UNMERGE 2
+#define KSM_RUN_OFFLINE 4
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
+
+static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DEFINE_MUTEX(ksm_thread_mutex);
+static DEFINE_SPINLOCK(ksm_mmlist_lock);
+
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+ sizeof(struct __struct), __alignof__(struct __struct),\
+ (__flags), NULL) /* slab cache "ksm_<__struct>", sized and aligned for struct <__struct> */
+
+/*
+ * Create the rmap_item, stable_node and mm_slot slab caches.
+ *
+ * Returns 0 when all three exist, or -ENOMEM after destroying whichever
+ * caches had already been created.
+ */
+static int __init ksm_slab_init(void)
+{
+	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+	if (!rmap_item_cache)
+		return -ENOMEM;
+
+	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+	if (!stable_node_cache)
+		goto undo_rmap_item;
+
+	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+	if (mm_slot_cache)
+		return 0;
+
+	/* Unwind in reverse order of creation */
+	kmem_cache_destroy(stable_node_cache);
+undo_rmap_item:
+	kmem_cache_destroy(rmap_item_cache);
+	return -ENOMEM;
+}
+
+/*
+ * Destroy the three slab caches, in the reverse of their creation order,
+ * then reset mm_slot_cache to NULL so that alloc_mm_slot() refuses to
+ * allocate from it.
+ */
+static void __init ksm_slab_free(void)
+{
+	struct kmem_cache *doomed[] = {
+		mm_slot_cache,
+		stable_node_cache,
+		rmap_item_cache,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(doomed) / sizeof(doomed[0]); i++)
+		kmem_cache_destroy(doomed[i]);
+	mm_slot_cache = NULL;
+}
+
+/*
+ * Allocate a zeroed rmap_item and account for it in ksm_rmap_items.
+ * Returns NULL (leaving the count untouched) if the allocation fails.
+ */
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+	struct rmap_item *item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+
+	if (!item)
+		return NULL;
+
+	ksm_rmap_items++;
+	return item;
+}
+
+/*
+ * Return an rmap_item to its cache and drop it from the ksm_rmap_items count.
+ */
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+	rmap_item->mm = NULL;	/* debug safety */
+	ksm_rmap_items--;
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+/*
+ * Allocate a stable_node from its cache; the memory is not zeroed.
+ * Returns NULL on allocation failure.
+ */
+static inline struct stable_node *alloc_stable_node(void)
+{
+	struct stable_node *node;
+
+	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+	return node;
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+ kmem_cache_free(stable_node_cache, stable_node); /* back to the cache alloc_stable_node() draws from */
+}
+
+/*
+ * Allocate a zeroed mm_slot, or return NULL when either the allocation
+ * fails or the cache does not exist (initialization failed).
+ */
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+	return mm_slot_cache ?
+		kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL) : NULL;
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot); /* back to the cache alloc_mm_slot() draws from */
+}
+
+/*
+ * Look up the mm_slot registered for @mm in mm_slots_hash.
+ * Returns the slot, or NULL if @mm has none.
+ */
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+	struct mm_slot *candidate;
+
+	hash_for_each_possible(mm_slots_hash, candidate, link,
+			       (unsigned long)mm) {
+		/* Different mms may share a bucket: compare the pointer itself */
+		if (candidate->mm == mm)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ mm_slot->mm = mm; /* get_mm_slot() compares against this field */
+ hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); /* same key as the lookup in get_mm_slot() */
+}
+
+/*
+ * Once an mm has passed through ksm_exit(), neither ksmd nor
+ * unmerge_and_remove_all_rmap_items() may touch its page tables;
+ * ksm_exit() briefly takes mmap_sem, where needed, to serialize against
+ * them. No special flag is set by ksm_exit(): mm_users reaching zero is
+ * the signal to back out, and ksm_test_exit() is how that is tested
+ * throughout - in some places for correctness, in others merely to avoid
+ * unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return !atomic_read(&mm->mm_users);
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * Returns -ENOMEM if the last fault attempt reported VM_FAULT_OOM, and 0
+ * otherwise - including when follow_page() finds nothing at @addr.
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ int ret = 0;
+
+ do {
+ cond_resched();
+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ if (IS_ERR_OR_NULL(page))
+ break;
+ if (PageKsm(page))
+ ret = handle_mm_fault(vma->vm_mm, vma, addr,
+ FAULT_FLAG_WRITE);
+ else
+ ret = VM_FAULT_WRITE; /* not a ksm page: nothing to break, so end the loop */
+ put_page(page); /* pairs with FOLL_GET above */
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+ /*
+ * We must loop because handle_mm_fault() may back out if there's
+ * any difficulty e.g. if pte accessed bit gets updated concurrently.
+ *
+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+ * COW has been broken, even if the vma does not permit VM_WRITE;
+ * but note that a concurrent fault might break PageKsm for us.
+ *
+ * VM_FAULT_SIGBUS could occur if we race with truncation of the
+ * backing file, which also invalidates anonymous pages: that's
+ * okay, that truncation will have unmapped the PageKsm for us.
+ *
+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+ * current task has TIF_MEMDIE set, and will be OOM killed on return
+ * to user; and ksmd, having no mm, would never be chosen for that.
+ *
+ * But if the mm is in a limited mem_cgroup, then the fault may fail
+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+ * even ksmd can fail in this way - though it's usually breaking ksm
+ * just to undo a merge it made a moment before, so unlikely to oom.
+ *
+ * That's a pity: we might therefore have more kernel pages allocated
+ * than we're counting as nodes in the stable tree; but ksm_do_scan
+ * will retry to break_cow on each pass, so should recover the page
+ * in due course. The important thing is to not let VM_MERGEABLE
+ * be cleared while any such pages might remain in the area.
+ */
+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+/*
+ * Find the vma of @mm that contains @addr, provided it is VM_MERGEABLE and
+ * has an anon_vma. Returns NULL otherwise, or if the mm is exiting.
+ */
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+		unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	if (ksm_test_exit(mm))
+		return NULL;
+
+	vma = find_vma(mm, addr);
+	/* find_vma() may hand back a vma lying wholly above addr */
+	if (vma && vma->vm_start <= addr &&
+	    (vma->vm_flags & VM_MERGEABLE) && vma->anon_vma)
+		return vma;
+
+	return NULL;
+}
+
+static void break_cow(struct rmap_item *rmap_item)
+{ /* undo a merge: break COW at rmap_item's address, if a mergeable vma still covers it */
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address;
+ struct vm_area_struct *vma;
+
+ /*
+ * It is not an accident that whenever we want to break COW
+ * to undo, we also need to drop a reference to the anon_vma.
+ */
+ put_anon_vma(rmap_item->anon_vma);
+
+ down_read(&mm->mmap_sem);
+ vma = find_mergeable_vma(mm, addr);
+ if (vma)
+ break_ksm(vma, addr); /* return value (-ENOMEM or 0) is deliberately not checked here */
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * If @page belongs to a transparent compound page whose head is anonymous,
+ * return that head page; otherwise return NULL.
+ */
+static struct page *page_trans_compound_anon(struct page *page)
+{
+	struct page *head;
+
+	if (!PageTransCompound(page))
+		return NULL;
+
+	/*
+	 * head may actually be splitted and freed from under
+	 * us but it's ok here.
+	 */
+	head = compound_head(page);
+	return PageAnon(head) ? head : NULL;
+}
+
+/*
+ * Look up the page currently mapped at rmap_item's address, under mmap_sem.
+ *
+ * Returns the page, with the reference taken by FOLL_GET still held, when
+ * it is anonymous (or part of an anonymous transparent compound page) in a
+ * mergeable vma; returns NULL in every other case.
+ */
+static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	unsigned long addr = rmap_item->address;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+
+	down_read(&mm->mmap_sem);
+
+	vma = find_mergeable_vma(mm, addr);
+	if (!vma)
+		goto unlock;
+
+	page = follow_page(vma, addr, FOLL_GET);
+	if (IS_ERR_OR_NULL(page)) {
+		page = NULL;
+		goto unlock;
+	}
+
+	if (!PageAnon(page) && !page_trans_compound_anon(page)) {
+		/* Not something we can merge: give the reference back */
+		put_page(page);
+		page = NULL;
+		goto unlock;
+	}
+
+	flush_anon_page(vma, page, addr);
+	flush_dcache_page(page);
+unlock:
+	up_read(&mm->mmap_sem);
+	return page;
+}
+
+/*
+ * Pick the index into the arrays of tree roots for a given pfn.
+ * With the merge_across_nodes knob set to 1 there is just one stable and
+ * one unstable rb-tree, covering pages from all nodes, rooted at index 0.
+ * Otherwise each node has a stable and an unstable tree of its own.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+	if (ksm_merge_across_nodes)
+		return 0;
+
+	return NUMA(pfn_to_nid(kpfn));
+}
+
+/*
+ * Detach every rmap_item hanging off @stable_node (fixing up the sharing
+ * counters, dropping each anon_vma reference and clearing the flag bits
+ * kept in the address), unlink the node from wherever it currently lives -
+ * the migrate_nodes list or its stable rb-tree - and free it.
+ */
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+	struct rmap_item *item;
+
+	hlist_for_each_entry(item, &stable_node->hlist, hlist) {
+		/* The last entry stands for the shared page, the rest for sharers */
+		if (!item->hlist.next)
+			ksm_pages_shared--;
+		else
+			ksm_pages_sharing--;
+		put_anon_vma(item->anon_vma);
+		item->address &= PAGE_MASK;
+		cond_resched();
+	}
+
+	if (stable_node->head != &migrate_nodes)
+		rb_erase(&stable_node->node,
+			 root_stable_tree + NUMA(stable_node->nid));
+	else
+		list_del(&stable_node->list);
+
+	free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * NOTE(review): the sentence fragment that stood here - "is on its way to
+ * being freed; but it is an anomaly to bear in mind." - has lost its
+ * beginning; restore the full note or drop the fragment.
+ *
+ * On success the page is returned with a reference held, and also locked
+ * when @lock_it is true (page->mapping is rechecked under the page lock).
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+{
+ struct page *page;
+ void *expected_mapping;
+ unsigned long kpfn;
+
+ expected_mapping = (void *)stable_node +
+ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+again:
+ kpfn = ACCESS_ONCE(stable_node->kpfn);
+ page = pfn_to_page(kpfn);
+
+ /*
+ * page is computed from kpfn, so on most architectures reading
+ * page->mapping is naturally ordered after reading node->kpfn,
+ * but on Alpha we need to be more careful.
+ */
+ smp_read_barrier_depends();
+ if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ goto stale;
+
+ /*
+ * We cannot do anything with the page while its refcount is 0.
+ * Usually 0 means free, or tail of a higher-order page: in which
+ * case this node is no longer referenced, and should be freed;
+ * however, it might mean that the page is under page_freeze_refs().
+ * The __remove_mapping() case is easy, again the node is now stale;
+ * but if page is swapcache in migrate_page_move_mapping(), it might
+ * still be our page, in which case it's essential to keep the node.
+ */
+ while (!get_page_unless_zero(page)) {
+ /*
+ * Another check for page->mapping != expected_mapping would
+ * work here too. We have chosen the !PageSwapCache test to
+ * optimize the common case, when the page is or is about to
+ * be freed: PageSwapCache is cleared (under spin_lock_irq)
+ * in the freeze_refs section of __remove_mapping(); but Anon
+ * page->mapping reset to NULL later, in free_pages_prepare().
+ */
+ if (!PageSwapCache(page))
+ goto stale;
+ cpu_relax();
+ }
+
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) { /* recheck now that we hold a reference */
+ put_page(page);
+ goto stale;
+ }
+
+ if (lock_it) {
+ lock_page(page);
+ if (ACCESS_ONCE(page->mapping) != expected_mapping) { /* and once more under the page lock */
+ unlock_page(page);
+ put_page(page);
+ goto stale;
+ }
+ }
+ return page;
+
+stale:
+ /*
+ * We come here from above when page->mapping or !PageSwapCache
+ * suggests that the node is stale; but it might be under migration.
+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * before checking whether node->kpfn has been changed.
+ */
+ smp_rmb();
+ if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ goto again; /* kpfn changed under us: retry against the new pfn */
+ remove_node_from_stable_tree(stable_node);
+ return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ *
+ * Which tree (if any) the item belongs to is read from the STABLE_FLAG and
+ * UNSTABLE_FLAG bits kept in rmap_item->address; both cases finish by
+ * masking the address back down with PAGE_MASK. An rmap_item with neither
+ * flag set is left untouched.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+ if (rmap_item->address & STABLE_FLAG) {
+ struct stable_node *stable_node;
+ struct page *page;
+
+ stable_node = rmap_item->head;
+ page = get_ksm_page(stable_node, true);
+ if (!page)
+ goto out; /* stale node: get_ksm_page() already tore it down, this rmap_item included */
+
+ hlist_del(&rmap_item->hlist);
+ unlock_page(page);
+ put_page(page);
+
+ if (stable_node->hlist.first) /* other rmap_items still hang off the node */
+ ksm_pages_sharing--;
+ else
+ ksm_pages_shared--;
+
+ put_anon_vma(rmap_item->anon_vma);
+ rmap_item->address &= PAGE_MASK;
+
+ } else if (rmap_item->address & UNSTABLE_FLAG) {
+ unsigned char age;
+ /*
+ * Usually ksmd can and must skip the rb_erase, because
+ * root_unstable_tree was already reset to RB_ROOT.
+ * But be careful when an mm is exiting: do the rb_erase
+ * if this rmap_item was inserted by this scan, rather
+ * than left over from before.
+ */
+ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); /* low 8 bits of address hold the seqnr at insertion */
+ BUG_ON(age > 1);
+ if (!age)
+ rb_erase(&rmap_item->node,
+ root_unstable_tree + NUMA(rmap_item->nid));
+ ksm_pages_unshared--;
+ rmap_item->address &= PAGE_MASK;
+ }
+out:
+ cond_resched(); /* we're called from many long loops */
+}
+
+/*
+ * Unlink, remove from its tree and free every rmap_item from *rmap_list
+ * onwards, leaving *rmap_list NULL. @mm_slot is not used here.
+ */
+static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
+		struct rmap_item **rmap_list)
+{
+	struct rmap_item *victim;
+
+	while ((victim = *rmap_list) != NULL) {
+		/* Advance the list before the item is freed */
+		*rmap_list = victim->rmap_list;
+		remove_rmap_item_from_tree(victim);
+		free_rmap_item(victim);
+	}
+}
+
+/*
+ * Tempting as it is to unmerge rmap_items from the stable tree instead of
+ * checking every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Likewise, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ *
+ * Returns 0 on success (or if the mm is found to be exiting),
+ * -ERESTARTSYS if a signal is pending, or the error from break_ksm().
+ */
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+	int err;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))
+			break;
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+
+		err = break_ksm(vma, addr);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface.
+ *
+ * Returns 0 if the node is gone (whether removed here or by get_ksm_page()),
+ * or -EBUSY if its page was unexpectedly still mapped.
+ */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+	struct page *page = get_ksm_page(stable_node, true);
+	int err = 0;
+
+	/* get_ksm_page did remove_node_from_stable_tree itself. */
+	if (!page)
+		return 0;
+
+	if (WARN_ON_ONCE(page_mapped(page))) {
+		/*
+		 * This should not happen: but if it does, just refuse to let
+		 * merge_across_nodes be switched - there is no need to panic.
+		 */
+		err = -EBUSY;
+	} else {
+		/*
+		 * The stable node did not yet appear stale to get_ksm_page(),
+		 * since that allows for an unmapped ksm page to be recognized
+		 * right up until it is freed; but the node is safe to remove.
+		 * This page might be in a pagevec waiting to be freed,
+		 * or it might be PageSwapCache (perhaps under writeback),
+		 * or it might have been removed from swapcache a moment ago.
+		 */
+		set_page_stable_node(page, NULL);
+		remove_node_from_stable_tree(stable_node);
+	}
+
+	unlock_page(page);
+	put_page(page);
+	return err;
+}
+
+/*
+ * Try to remove every stable node: first those in each per-nid stable
+ * tree, then those parked on the migrate_nodes list.
+ *
+ * Returns 0 if everything was removed, -EBUSY if any node had to be left.
+ */
+static int remove_all_stable_nodes(void)
+{
+	struct stable_node *snode, *next;
+	int nid;
+	int err = 0;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		struct rb_root *root = &root_stable_tree[nid];
+
+		/* Keep taking the root until the tree is empty or a node is busy */
+		while (root->rb_node) {
+			snode = rb_entry(root->rb_node,
+					 struct stable_node, node);
+			if (remove_stable_node(snode)) {
+				err = -EBUSY;
+				break;	/* proceed to next nid */
+			}
+			cond_resched();
+		}
+	}
+
+	list_for_each_entry_safe(snode, next, &migrate_nodes, list) {
+		if (remove_stable_node(snode))
+			err = -EBUSY;
+		cond_resched();
+	}
+	return err;
+}
+
+static int unmerge_and_remove_all_rmap_items(void)
+{ /* for every mm_slot on the list: unmerge its mergeable vmas, then free its rmap_items */
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
+ struct mm_slot, mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ for (mm_slot = ksm_scan.mm_slot;
+ mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { /* ksm_scan.mm_slot is advanced inside the body */
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (ksm_test_exit(mm))
+ break;
+ if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+ continue;
+ err = unmerge_ksm_pages(vma,
+ vma->vm_start, vma->vm_end);
+ if (err)
+ goto error; /* mmap_sem is still held: the error path releases it */
+ }
+
+ remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_test_exit(mm)) { /* mm is exiting: unhash, unlink and free its slot */
+ hash_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ up_read(&mm->mmap_sem);
+ mmdrop(mm);
+ } else {
+ spin_unlock(&ksm_mmlist_lock);
+ up_read(&mm->mmap_sem);
+ }
+ }
+
+ /* Clean up stable nodes, but don't worry if some are still busy */
+ remove_all_stable_nodes();
+ ksm_scan.seqnr = 0;
+ return 0;
+
+error:
+ up_read(&mm->mmap_sem);
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = &ksm_mm_head; /* park the scan cursor back on the list head */
+ spin_unlock(&ksm_mmlist_lock);
+ return err;
+}
+#endif /* CONFIG_SYSFS */
+
+/*
+ * Hash the whole of @page with jhash2(), taking it as PAGE_SIZE / 4 32-bit
+ * words with an initial value of 17.
+ */
+static u32 calc_checksum(struct page *page)
+{
+	void *kaddr = kmap_atomic(page);
+	u32 sum = jhash2(kaddr, PAGE_SIZE / 4, 17);
+
+	kunmap_atomic(kaddr);
+	return sum;
+}
+
+/*
+ * memcmp() the full contents of two pages.
+ * Returns <0, 0 or >0 just as memcmp() does.
+ */
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+	char *first = kmap_atomic(page1);
+	char *second = kmap_atomic(page2);
+	int diff = memcmp(first, second, PAGE_SIZE);
+
+	/* Unmap in the reverse order of mapping */
+	kunmap_atomic(second);
+	kunmap_atomic(first);
+	return diff;
+}
+
+/* Returns 1 if the two pages have byte-identical contents, 0 otherwise. */
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return memcmp_pages(page1, page2) == 0;
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+ pte_t *orig_pte)
+{ /*
+ * Make the pte mapping @page in @vma read-only and clean, unless extra
+ * references to the page are detected. On success returns 0 and stores
+ * the resulting pte in *orig_pte; returns -EFAULT otherwise.
+ */
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ int swapped;
+ int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ BUG_ON(PageTransCompound(page));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ ptep = page_check_address(page, mm, addr, &ptl, 0);
+ if (!ptep)
+ goto out_mn;
+
+ if (pte_write(*ptep) || pte_dirty(*ptep)) { /* a pte that is already read-only and clean is left alone */
+ pte_t entry;
+
+ swapped = PageSwapCache(page);
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ /*
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
+ * take any lock, therefore the check that we are going to make
+ * with the pagecount against the mapcount is racey and
+ * O_DIRECT can happen right after the check.
+ * So we clear the pte and flush the tlb before the check
+ * this assure us that no O_DIRECT can happen after the check
+ * or in the middle of the check.
+ */
+ entry = ptep_clear_flush(vma, addr, ptep);
+ /*
+ * Check that no O_DIRECT or similar I/O is in progress on the
+ * page
+ */
+ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ set_pte_at(mm, addr, ptep, entry); /* put the original pte back and fail with -EFAULT */
+ goto out_unlock;
+ }
+ if (pte_dirty(entry))
+ set_page_dirty(page); /* carry the dirty bit over to the page before the pte is cleaned */
+ entry = pte_mkclean(pte_wrprotect(entry));
+ set_pte_at_notify(mm, addr, ptep, entry);
+ }
+ *orig_pte = *ptep; /* replace_page() later compares the live pte against this value */
+ err = 0;
+
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return err;
+}
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma: vma that holds the pte pointing to page
+ * @page: the page we are replacing by kpage
+ * @kpage: the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * The swap is only made if the pte still equals @orig_pte once the page
+ * table lock is held. On success @kpage gains a reference and an anon
+ * rmap, while @page loses its rmap and one reference.
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+ struct page *kpage, pte_t orig_pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ addr = page_address_in_vma(page, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!pte_same(*ptep, orig_pte)) { /* pte changed since write_protect_page(): give up */
+ pte_unmap_unlock(ptep, ptl);
+ goto out_mn;
+ }
+
+ get_page(kpage);
+ page_add_anon_rmap(kpage, vma, addr);
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+ page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
+ put_page(page);
+
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+ return err;
+}
+
+/*
+ * If @page is part of an anonymous transparent compound page, try to split
+ * that compound page.
+ *
+ * Returns 0 when there was nothing to split, the split_huge_page() result
+ * when a split was attempted, and 1 ("retry later") when the head could
+ * not be pinned or was no longer anonymous once pinned.
+ */
+static int page_trans_compound_anon_split(struct page *page)
+{
+	struct page *head = page_trans_compound_anon(page);
+	int ret;
+
+	if (!head)
+		return 0;
+
+	/* Get the reference on the head to split it. */
+	if (!get_page_unless_zero(head))
+		return 1;	/* Retry later if split_huge_page run from under us. */
+
+	/*
+	 * Recheck we got the reference while the head was still anonymous;
+	 * if not, retry later since split_huge_page run from under us.
+	 */
+	if (PageAnon(head))
+		ret = split_huge_page(head);
+	else
+		ret = 1;
+
+	put_page(head);
+	return ret;
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @vma: the vma that holds the pte pointing to page
+ * @page: the PageAnon page that we want to replace with kpage
+ * @kpage: the PageKsm page that we want to map instead of page,
+ * or NULL the first time when we want to use page as kpage.
+ *
+ * With @kpage NULL, a successful write-protect is followed by turning
+ * @page itself into a ksm page (with a NULL stable_node for now).
+ * With @kpage set, the pte is only switched over to @kpage if the two
+ * pages compare identical.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_one_page(struct vm_area_struct *vma,
+ struct page *page, struct page *kpage)
+{
+ pte_t orig_pte = __pte(0);
+ int err = -EFAULT;
+
+ if (page == kpage) /* ksm page forked */
+ return 0;
+
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
+ if (!PageAnon(page))
+ goto out;
+
+ /*
+ * We need the page lock to read a stable PageSwapCache in
+ * write_protect_page(). We use trylock_page() instead of
+ * lock_page() because we don't want to wait here - we
+ * prefer to continue scanning and merging different pages,
+ * then come back to this page when it is unlocked.
+ */
+ if (!trylock_page(page))
+ goto out;
+ /*
+ * If this anonymous page is mapped only here, its pte may need
+ * to be write-protected. If it's mapped elsewhere, all of its
+ * ptes are necessarily already write-protected. But in either
+ * case, we need to lock and check page_count is not raised.
+ */
+ if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (!kpage) {
+ /*
+ * While we hold page lock, upgrade page from
+ * PageAnon+anon_vma to PageKsm+NULL stable_node:
+ * stable_tree_insert() will update stable_node.
+ */
+ set_page_stable_node(page, NULL);
+ mark_page_accessed(page);
+ err = 0;
+ } else if (pages_identical(page, kpage))
+ err = replace_page(vma, page, kpage, orig_pte);
+ }
+
+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { /* merged into kpage within an mlocked vma */
+ munlock_vma_page(page);
+ if (!PageMlocked(kpage)) {
+ unlock_page(page);
+ lock_page(kpage);
+ mlock_vma_page(kpage);
+ page = kpage; /* for final unlock */
+ }
+ }
+
+ unlock_page(page);
+out:
+ return err;
+}
+
+/*
+ * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
+ * but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * (try_to_merge_two_pages() also calls this with a NULL kpage, to upgrade
+ * @page itself into a ksm page.)
+ *
+ * Runs under the mm's mmap_sem held for read. On success the rmap_item has
+ * been taken out of whichever tree it was in, and holds a reference on the
+ * vma's anon_vma.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+ struct page *page, struct page *kpage)
+{
+ struct mm_struct *mm = rmap_item->mm;
+ struct vm_area_struct *vma;
+ int err = -EFAULT;
+
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ goto out;
+ vma = find_vma(mm, rmap_item->address);
+ if (!vma || vma->vm_start > rmap_item->address) /* no vma actually contains the address */
+ goto out;
+
+ err = try_to_merge_one_page(vma, page, kpage);
+ if (err)
+ goto out;
+
+ /* Unstable nid is in union with stable anon_vma: remove first */
+ remove_rmap_item_from_tree(rmap_item);
+
+ /* Must get reference to anon_vma while still holding mmap_sem */
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
+out:
+ up_read(&mm->mmap_sem);
+ return err;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them
+ * to be merged into one page.
+ *
+ * @page is first upgraded to a ksm page, then @tree_page is merged into it.
+ * Returns @page (now the kpage) if both steps succeed, NULL otherwise.
+ *
+ * Since this upgrades @page to a ksm page, use try_to_merge_with_ksm_page
+ * instead when one of the pages already is a ksm page.
+ */
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+		struct page *page,
+		struct rmap_item *tree_rmap_item,
+		struct page *tree_page)
+{
+	if (try_to_merge_with_ksm_page(rmap_item, page, NULL))
+		return NULL;
+
+	if (try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page)) {
+		/*
+		 * We are left with a ksm page that has only one pte
+		 * pointing to it: so break it.
+		 */
+		break_cow(rmap_item);
+		return NULL;
+	}
+
+	return page;
+}
+
+/*
+ * stable_tree_search - search for page inside the stable tree
+ *
+ * This function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * This function returns the ksm page of identical content if found (with
+ * a reference held on it), NULL otherwise. The returned page is @page
+ * itself when @page is already a ksm page: either one whose node is in
+ * the tree, or one whose node was on the migrate_nodes list and has just
+ * been (re)inserted into the tree here.
+ */
+static struct page *stable_tree_search(struct page *page)
+{
+ int nid;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent;
+ struct stable_node *stable_node;
+ struct stable_node *page_node;
+
+ page_node = page_stable_node(page);
+ if (page_node && page_node->head != &migrate_nodes) {
+ /* ksm page forked */
+ get_page(page);
+ return page;
+ }
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_stable_tree + nid;
+again:
+ new = &root->rb_node;
+ parent = NULL;
+
+ while (*new) {
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node, false);
+ if (!tree_page)
+ return NULL; /* stale node was removed by get_ksm_page(): give up on this search */
+
+ ret = memcmp_pages(page, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ /*
+ * Lock and unlock the stable_node's page (which
+ * might already have been migrated) so that page
+ * migration is sure to notice its raised count.
+ * It would be more elegant to return stable_node
+ * than kpage, but that involves more changes.
+ */
+ tree_page = get_ksm_page(stable_node, true);
+ if (tree_page) {
+ unlock_page(tree_page);
+ if (get_kpfn_nid(stable_node->kpfn) !=
+ NUMA(stable_node->nid)) { /* node now sits in the wrong nid's tree */
+ put_page(tree_page);
+ goto replace;
+ }
+ return tree_page;
+ }
+ /*
+ * There is now a place for page_node, but the tree may
+ * have been rebalanced, so re-evaluate parent and new.
+ */
+ if (page_node)
+ goto again;
+ return NULL;
+ }
+ }
+
+ if (!page_node)
+ return NULL;
+
+ list_del(&page_node->list); /* off the migrate_nodes list, into the tree */
+ DO_NUMA(page_node->nid = nid);
+ rb_link_node(&page_node->node, parent, new);
+ rb_insert_color(&page_node->node, root);
+ get_page(page);
+ return page;
+
+replace:
+ if (page_node) {
+ list_del(&page_node->list);
+ DO_NUMA(page_node->nid = nid);
+ rb_replace_node(&stable_node->node, &page_node->node, root);
+ get_page(page);
+ } else {
+ rb_erase(&stable_node->node, root);
+ page = NULL;
+ }
+ stable_node->head = &migrate_nodes; /* the displaced node waits on migrate_nodes */
+ list_add(&stable_node->list, stable_node->head);
+ return page;
+}
+
+/*
+ * stable_tree_insert - insert stable tree node pointing to new ksm page
+ * into the stable tree.
+ *
+ * This function returns the stable tree node just allocated on success,
+ * NULL otherwise. NULL covers three cases: a stale node was met during the
+ * walk, a page of identical content is already in the tree, or the node
+ * allocation failed.
+ */
+static struct stable_node *stable_tree_insert(struct page *kpage)
+{
+ int nid;
+ unsigned long kpfn;
+ struct rb_root *root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct stable_node *stable_node;
+
+ kpfn = page_to_pfn(kpage);
+ nid = get_kpfn_nid(kpfn);
+ root = root_stable_tree + nid;
+ new = &root->rb_node;
+
+ while (*new) {
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ stable_node = rb_entry(*new, struct stable_node, node);
+ tree_page = get_ksm_page(stable_node, false);
+ if (!tree_page)
+ return NULL;
+
+ ret = memcmp_pages(kpage, tree_page);
+ put_page(tree_page);
+
+ parent = *new;
+ if (ret < 0)
+ new = &parent->rb_left;
+ else if (ret > 0)
+ new = &parent->rb_right;
+ else {
+ /*
+ * It is not a bug that stable_tree_search() didn't
+ * find this node: because at that time our page was
+ * not yet write-protected, so may have changed since.
+ */
+ return NULL;
+ }
+ }
+
+ stable_node = alloc_stable_node();
+ if (!stable_node)
+ return NULL;
+
+ INIT_HLIST_HEAD(&stable_node->hlist); /* alloc_stable_node() does not zero the node */
+ stable_node->kpfn = kpfn;
+ set_page_stable_node(kpage, stable_node);
+ DO_NUMA(stable_node->nid = nid);
+ rb_link_node(&stable_node->node, parent, new);
+ rb_insert_color(&stable_node->node, root);
+
+ return stable_node;
+}
+
+/*
+ * unstable_tree_search_insert - search for identical page,
+ * else insert rmap_item into the unstable tree.
+ *
+ * This function searches for a page in the unstable tree identical to the
+ * page currently being scanned; and if no identical page is found in the
+ * tree, we insert rmap_item as a new object into the unstable tree.
+ *
+ * This function returns pointer to rmap_item found to be identical
+ * to the currently scanned page, NULL otherwise.
+ *
+ * When a match is returned, *tree_pagep is set to the matching page and
+ * the reference get_mergeable_page() took on it is not dropped here.
+ *
+ * This function does both searching and inserting, because they share
+ * the same walking algorithm in an rbtree.
+ */
+static
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+ struct page *page,
+ struct page **tree_pagep)
+{
+ struct rb_node **new;
+ struct rb_root *root;
+ struct rb_node *parent = NULL;
+ int nid;
+
+ nid = get_kpfn_nid(page_to_pfn(page));
+ root = root_unstable_tree + nid;
+ new = &root->rb_node;
+
+ while (*new) {
+ struct rmap_item *tree_rmap_item;
+ struct page *tree_page;
+ int ret;
+
+ cond_resched();
+ tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ tree_page = get_mergeable_page(tree_rmap_item);
+ if (IS_ERR_OR_NULL(tree_page)) /* get_mergeable_page() as written returns a page or NULL */
+ return NULL;
+
+ /*
+ * Don't substitute a ksm page for a forked page.
+ */
+ if (page == tree_page) {
+ put_page(tree_page);
+ return NULL;
+ }
+
+ ret = memcmp_pages(page, tree_page);
+
+ parent = *new;
+ if (ret < 0) {
+ put_page(tree_page);
+ new = &parent->rb_left;
+ } else if (ret > 0) {
+ put_page(tree_page);
+ new = &parent->rb_right;
+ } else if (!ksm_merge_across_nodes &&
+ page_to_nid(tree_page) != nid) {
+ /*
+ * If tree_page has been migrated to another NUMA node,
+ * it will be flushed out and put in the right unstable
+ * tree next time: only merge with it when across_nodes.
+ */
+ put_page(tree_page);
+ return NULL;
+ } else {
+ *tree_pagep = tree_page;
+ return tree_rmap_item;
+ }
+ }
+
+ rmap_item->address |= UNSTABLE_FLAG;
+ rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); /* remembered so remove_rmap_item_from_tree() can compute the item's age */
+ DO_NUMA(rmap_item->nid = nid);
+ rb_link_node(&rmap_item->node, parent, new);
+ rb_insert_color(&rmap_item->node, root);
+
+ ksm_pages_unshared++;
+ return NULL;
+}
+
+/*
+ * stable_tree_append - hang one more rmap_item on the list of rmap_items
+ * belonging to a given stable tree node, all of which share the same
+ * ksm page.
+ */
+static void stable_tree_append(struct rmap_item *rmap_item,
+		struct stable_node *stable_node)
+{
+	rmap_item->head = stable_node;
+	rmap_item->address |= STABLE_FLAG;
+	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
+
+	/* With no successor, this is the only rmap_item on the node */
+	if (!rmap_item->hlist.next)
+		ksm_pages_shared++;
+	else
+		ksm_pages_sharing++;
+}
+
+/*
+ * cmp_and_merge_page - first see if page can be merged into the stable tree;
+ * if not, compare checksum to previous and if it's the same, see if page can
+ * be inserted into the unstable tree, or merged with a page already there and
+ * both transferred to the stable tree.
+ *
+ * @page: the page for which we are searching for an identical page.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+{
+ struct rmap_item *tree_rmap_item;
+ struct page *tree_page = NULL;
+ struct stable_node *stable_node;
+ struct page *kpage;
+ unsigned int checksum;
+ int err;
+
+ stable_node = page_stable_node(page);
+ if (stable_node) {
+ if (stable_node->head != &migrate_nodes &&
+ get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+ rb_erase(&stable_node->node,
+ root_stable_tree + NUMA(stable_node->nid));
+ stable_node->head = &migrate_nodes; /* node id changed: park on migrate_nodes */
+ list_add(&stable_node->list, stable_node->head);
+ }
+ if (stable_node->head != &migrate_nodes &&
+ rmap_item->head == stable_node)
+ return; /* rmap_item already hangs off this stable node */
+ }
+
+ /* We first start with searching the page inside the stable tree */
+ kpage = stable_tree_search(page);
+ if (kpage == page && rmap_item->head == stable_node) {
+ put_page(kpage);
+ return;
+ }
+
+ remove_rmap_item_from_tree(rmap_item);
+
+ if (kpage) {
+ err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+ if (!err) {
+ /*
+ * The page was successfully merged:
+ * add its rmap_item to the stable tree.
+ */
+ lock_page(kpage);
+ stable_tree_append(rmap_item, page_stable_node(kpage));
+ unlock_page(kpage);
+ }
+ put_page(kpage);
+ return;
+ }
+
+ /*
+ * If the hash value of the page has changed from the last time
+ * we calculated it, this page is changing frequently: therefore we
+ * don't want to insert it in the unstable tree, and we don't want
+ * to waste our time searching for something identical to it there.
+ */
+ checksum = calc_checksum(page);
+ if (rmap_item->oldchecksum != checksum) {
+ rmap_item->oldchecksum = checksum; /* record it and skip the page for now */
+ return;
+ }
+
+ tree_rmap_item =
+ unstable_tree_search_insert(rmap_item, page, &tree_page);
+ if (tree_rmap_item) {
+ kpage = try_to_merge_two_pages(rmap_item, page,
+ tree_rmap_item, tree_page);
+ put_page(tree_page); /* drop the page handed back by the search */
+ if (kpage) {
+ /*
+ * The pages were successfully merged: insert new
+ * node in the stable tree and add both rmap_items.
+ */
+ lock_page(kpage);
+ stable_node = stable_tree_insert(kpage);
+ if (stable_node) {
+ stable_tree_append(tree_rmap_item, stable_node);
+ stable_tree_append(rmap_item, stable_node);
+ }
+ unlock_page(kpage);
+
+ /*
+ * If we fail to insert the page into the stable tree,
+ * we will have 2 virtual addresses that are pointing
+ * to a ksm page left outside the stable tree,
+ * in which case we need to break_cow on both.
+ */
+ if (!stable_node) {
+ break_cow(tree_rmap_item);
+ break_cow(rmap_item);
+ }
+ }
+ }
+}
+
+static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
+ struct rmap_item **rmap_list,
+ unsigned long addr)
+{
+ struct rmap_item *rmap_item;
+
+ while (*rmap_list) {
+ rmap_item = *rmap_list;
+ if ((rmap_item->address & PAGE_MASK) == addr)
+ return rmap_item; /* reuse the existing item for this page */
+ if (rmap_item->address > addr)
+ break; /* entry lies beyond addr: stop and allocate */
+ *rmap_list = rmap_item->rmap_list; /* unlink the entry below addr */
+ remove_rmap_item_from_tree(rmap_item);
+ free_rmap_item(rmap_item);
+ }
+
+ rmap_item = alloc_rmap_item(); /* may fail: NULL is returned to the caller */
+ if (rmap_item) {
+ /* It has already been zeroed */
+ rmap_item->mm = mm_slot->mm;
+ rmap_item->address = addr;
+ rmap_item->rmap_list = *rmap_list;
+ *rmap_list = rmap_item; /* link the new item in at the cursor */
+ }
+ return rmap_item;
+}
+
+static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+ struct mm_struct *mm;
+ struct mm_slot *slot;
+ struct vm_area_struct *vma;
+ struct rmap_item *rmap_item;
+ int nid;
+
+ if (list_empty(&ksm_mm_head.mm_list))
+ return NULL;
+
+ slot = ksm_scan.mm_slot;
+ if (slot == &ksm_mm_head) { /* cursor at list head: a new full scan begins */
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
+ /*
+ * Whereas stale stable_nodes on the stable_tree itself
+ * get pruned in the regular course of stable_tree_search(),
+ * those moved out to the migrate_nodes list can accumulate:
+ * so prune them once before each full scan.
+ */
+ if (!ksm_merge_across_nodes) {
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ struct page *page;
+
+ list_for_each_safe(this, next, &migrate_nodes) {
+ stable_node = list_entry(this,
+ struct stable_node, list);
+ page = get_ksm_page(stable_node, false);
+ if (page)
+ put_page(page);
+ cond_resched();
+ }
+ }
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++)
+ root_unstable_tree[nid] = RB_ROOT; /* unstable trees restart empty */
+
+ spin_lock(&ksm_mmlist_lock);
+ slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
+ ksm_scan.mm_slot = slot;
+ spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
+next_mm:
+ ksm_scan.address = 0;
+ ksm_scan.rmap_list = &slot->rmap_list;
+ }
+
+ mm = slot->mm;
+ down_read(&mm->mmap_sem);
+ if (ksm_test_exit(mm))
+ vma = NULL;
+ else
+ vma = find_vma(mm, ksm_scan.address);
+
+ for (; vma; vma = vma->vm_next) {
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ continue;
+ if (ksm_scan.address < vma->vm_start)
+ ksm_scan.address = vma->vm_start;
+ if (!vma->anon_vma)
+ ksm_scan.address = vma->vm_end; /* skips the loop below for this vma */
+
+ while (ksm_scan.address < vma->vm_end) {
+ if (ksm_test_exit(mm))
+ break;
+ *page = follow_page(vma, ksm_scan.address, FOLL_GET);
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
+ flush_anon_page(vma, *page, ksm_scan.address);
+ flush_dcache_page(*page);
+ rmap_item = get_next_rmap_item(slot,
+ ksm_scan.rmap_list, ksm_scan.address);
+ if (rmap_item) {
+ ksm_scan.rmap_list =
+ &rmap_item->rmap_list;
+ ksm_scan.address += PAGE_SIZE;
+ } else
+ put_page(*page); /* no rmap_item: drop the FOLL_GET reference */
+ up_read(&mm->mmap_sem);
+ return rmap_item; /* on success *page is still referenced */
+ }
+ put_page(*page);
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ }
+ }
+
+ if (ksm_test_exit(mm)) {
+ ksm_scan.address = 0;
+ ksm_scan.rmap_list = &slot->rmap_list;
+ }
+ /*
+ * Nuke all the rmap_items that are above this current rmap:
+ * because there were no VM_MERGEABLE vmas with such addresses.
+ */
+ remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+
+ spin_lock(&ksm_mmlist_lock);
+ ksm_scan.mm_slot = list_entry(slot->mm_list.next,
+ struct mm_slot, mm_list);
+ if (ksm_scan.address == 0) {
+ /*
+ * We've completed a full scan of all vmas, holding mmap_sem
+ * throughout, and found no VM_MERGEABLE: so do the same as
+ * __ksm_exit does to remove this mm from all our lists now.
+ * This applies either when cleaning up after __ksm_exit
+ * (but beware: we can reach here even before __ksm_exit),
+ * or when all VM_MERGEABLE areas have been unmapped (and
+ * mmap_sem then protects against race with MADV_MERGEABLE).
+ */
+ hash_del(&slot->link);
+ list_del(&slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ free_mm_slot(slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ up_read(&mm->mmap_sem);
+ mmdrop(mm); /* pairs with the mm_count increment in __ksm_enter() */
+ } else {
+ spin_unlock(&ksm_mmlist_lock);
+ up_read(&mm->mmap_sem);
+ }
+
+ /* Repeat until we've completed scanning the whole list */
+ slot = ksm_scan.mm_slot;
+ if (slot != &ksm_mm_head)
+ goto next_mm;
+
+ ksm_scan.seqnr++; /* cursor wrapped to the list head: one full scan done */
+ return NULL;
+}
+
+/**
+ * ksm_do_scan - the ksm scanner main worker function.
+ * @scan_npages: number of pages we want to scan before we return.
+ */
+static void ksm_do_scan(unsigned int scan_npages)
+{
+ struct rmap_item *rmap_item;
+ struct page *uninitialized_var(page);
+
+ while (scan_npages-- && likely(!freezing(current))) {
+ cond_resched();
+ rmap_item = scan_get_next_rmap_item(&page);
+ if (!rmap_item)
+ return; /* nothing (more) to scan in this batch */
+ cmp_and_merge_page(page, rmap_item);
+ put_page(page); /* drop the reference returned with the page */
+ }
+}
+
+static int ksmd_should_run(void)
+{
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); /* merging on and an mm is registered */
+}
+
+static int ksm_scan_thread(void *nothing)
+{
+ set_freezable();
+ set_user_nice(current, 5); /* run the scanner at nice 5 */
+
+ while (!kthread_should_stop()) {
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksmd_should_run())
+ ksm_do_scan(ksm_thread_pages_to_scan); /* one batch per wakeup */
+ mutex_unlock(&ksm_thread_mutex);
+
+ try_to_freeze();
+
+ if (ksmd_should_run()) { /* more to do: pause between batches */
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ } else { /* idle: sleep until there is work or we must stop */
+ wait_event_freezable(ksm_thread_wait,
+ ksmd_should_run() || kthread_should_stop());
+ }
+ }
+ return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, int advice, unsigned long *vm_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ switch (advice) {
+ case MADV_MERGEABLE:
+ /*
+ * Be somewhat over-protective for now!
+ */
+ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+ VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+ return 0; /* just ignore the advice */
+
+#ifdef VM_SAO
+ if (*vm_flags & VM_SAO)
+ return 0;
+#endif
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+ err = __ksm_enter(mm); /* mm not yet registered with ksm */
+ if (err)
+ return err;
+ }
+
+ *vm_flags |= VM_MERGEABLE; /* only the caller's copy of the flags is changed */
+ break;
+
+ case MADV_UNMERGEABLE:
+ if (!(*vm_flags & VM_MERGEABLE))
+ return 0; /* just ignore the advice */
+
+ if (vma->anon_vma) {
+ err = unmerge_ksm_pages(vma, start, end);
+ if (err)
+ return err; /* flag stays set when unmerging failed */
+ }
+
+ *vm_flags &= ~VM_MERGEABLE;
+ break;
+ }
+
+ return 0;
+}
+
+int __ksm_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int needs_wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* Check ksm_run too? Would need tighter locking */
+ needs_wakeup = list_empty(&ksm_mm_head.mm_list); /* sampled before taking the lock */
+
+ spin_lock(&ksm_mmlist_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+ * insert just behind the scanning cursor, to let the area settle
+ * down a little; when fork is followed by immediate exec, we don't
+ * want ksmd to waste time setting up and tearing down an rmap_list.
+ *
+ * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+ * scanning cursor, otherwise KSM pages in newly forked mms will be
+ * missed: then we might as well insert at the end of the list.
+ */
+ if (ksm_run & KSM_RUN_UNMERGE)
+ list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ else
+ list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ spin_unlock(&ksm_mmlist_lock);
+
+ set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ atomic_inc(&mm->mm_count); /* released by mmdrop() when the slot is freed */
+
+ if (needs_wakeup)
+ wake_up_interruptible(&ksm_thread_wait); /* list was empty: ksmd may be idle */
+
+ return 0;
+}
+
+void __ksm_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int easy_to_free = 0;
+
+ /*
+ * This process is exiting: if it's straightforward (as is the
+ * case when ksmd was never running), free mm_slot immediately.
+ * But if it's at the cursor or has rmap_items linked to it, use
+ * mmap_sem to synchronize with any break_cows before pagetables
+ * are freed, and leave the mm_slot on the list for ksmd to free.
+ * Beware: ksm may already have noticed it exiting and freed the slot.
+ */
+
+ spin_lock(&ksm_mmlist_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+ if (!mm_slot->rmap_list) { /* no rmap_items: safe to free right here */
+ hash_del(&mm_slot->link);
+ list_del(&mm_slot->mm_list);
+ easy_to_free = 1;
+ } else {
+ list_move(&mm_slot->mm_list,
+ &ksm_scan.mm_slot->mm_list);
+ }
+ }
+ spin_unlock(&ksm_mmlist_lock);
+
+ if (easy_to_free) {
+ free_mm_slot(mm_slot);
+ clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ mmdrop(mm); /* pairs with the mm_count increment in __ksm_enter() */
+ } else if (mm_slot) {
+ down_write(&mm->mmap_sem); /* taken and released only to wait out holders */
+ up_write(&mm->mmap_sem);
+ }
+}
+
+struct page *ksm_might_need_to_copy(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = page_anon_vma(page);
+ struct page *new_page;
+
+ if (PageKsm(page)) {
+ if (page_stable_node(page) &&
+ !(ksm_run & KSM_RUN_UNMERGE))
+ return page; /* no need to copy it */
+ } else if (!anon_vma) {
+ return page; /* no need to copy it */
+ } else if (anon_vma->root == vma->anon_vma->root &&
+ page->index == linear_page_index(vma, address)) {
+ return page; /* still no need to copy it */
+ }
+ if (!PageUptodate(page))
+ return page; /* let do_swap_page report the error */
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page) {
+ copy_user_highpage(new_page, page, address, vma);
+
+ SetPageDirty(new_page);
+ __SetPageUptodate(new_page);
+ __set_page_locked(new_page); /* the copy is returned locked */
+ }
+
+ return new_page; /* NULL when the allocation failed */
+}
+
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct stable_node *stable_node;
+ struct rmap_item *rmap_item;
+ int ret = SWAP_AGAIN;
+ int search_new_forks = 0;
+
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
+
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ stable_node = page_stable_node(page);
+ if (!stable_node)
+ return ret; /* still SWAP_AGAIN: nothing was walked */
+again:
+ hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+ struct anon_vma *anon_vma = rmap_item->anon_vma;
+ struct anon_vma_chain *vmac;
+ struct vm_area_struct *vma;
+
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
+ vma = vmac->vma;
+ if (rmap_item->address < vma->vm_start ||
+ rmap_item->address >= vma->vm_end)
+ continue; /* vma does not cover this address */
+ /*
+ * Initially we examine only the vma which covers this
+ * rmap_item; but later, if there is still work to do,
+ * we examine covering vmas in other mms: in case they
+ * were forked from the original since ksmd passed.
+ */
+ if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ continue;
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg);
+ if (ret != SWAP_AGAIN) { /* callback asked to stop the walk */
+ anon_vma_unlock_read(anon_vma);
+ goto out;
+ }
+ if (rwc->done && rwc->done(page)) {
+ anon_vma_unlock_read(anon_vma);
+ goto out;
+ }
+ }
+ anon_vma_unlock_read(anon_vma);
+ }
+ if (!search_new_forks++) /* first pass finished: do one pass over other mms */
+ goto again;
+out:
+ return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+ struct stable_node *stable_node;
+
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+
+ stable_node = page_stable_node(newpage);
+ if (stable_node) {
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
+ stable_node->kpfn = page_to_pfn(newpage); /* repoint the node at newpage */
+ /*
+ * newpage->mapping was set in advance; now we need smp_wmb()
+ * to make sure that the new stable_node->kpfn is visible
+ * to get_ksm_page() before it can see that oldpage->mapping
+ * has gone stale (or that PageSwapCache has been cleared).
+ */
+ smp_wmb();
+ set_page_stable_node(oldpage, NULL); /* oldpage gives up the stable node */
+ }
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static int just_wait(void *word)
+{
+ schedule(); /* action passed to wait_on_bit() by wait_while_offlining() */
+ return 0;
+}
+
+static void wait_while_offlining(void)
+{
+ while (ksm_run & KSM_RUN_OFFLINE) { /* re-checked with the mutex held */
+ mutex_unlock(&ksm_thread_mutex); /* not held across the sleep */
+ wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+ just_wait, TASK_UNINTERRUPTIBLE);
+ mutex_lock(&ksm_thread_mutex);
+ }
+}
+
+static void ksm_check_stable_tree(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct stable_node *stable_node;
+ struct list_head *this, *next;
+ struct rb_node *node;
+ int nid;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ node = rb_first(root_stable_tree + nid);
+ while (node) {
+ stable_node = rb_entry(node, struct stable_node, node);
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn) { /* pfn in [start_pfn, end_pfn) */
+ /*
+ * Don't get_ksm_page, page has already gone:
+ * which is why we keep kpfn instead of page*
+ */
+ remove_node_from_stable_tree(stable_node);
+ node = rb_first(root_stable_tree + nid); /* restart after a removal */
+ } else
+ node = rb_next(node);
+ cond_resched();
+ }
+ }
+ list_for_each_safe(this, next, &migrate_nodes) { /* same check for parked nodes */
+ stable_node = list_entry(this, struct stable_node, list);
+ if (stable_node->kpfn >= start_pfn &&
+ stable_node->kpfn < end_pfn)
+ remove_node_from_stable_tree(stable_node);
+ cond_resched();
+ }
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ /*
+ * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+ * and remove_all_stable_nodes() while memory is going offline:
+ * it is unsafe for them to touch the stable tree at this time.
+ * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * which do not need the ksm_thread_mutex are all safe.
+ */
+ mutex_lock(&ksm_thread_mutex);
+ ksm_run |= KSM_RUN_OFFLINE; /* wait_while_offlining() callers now block */
+ mutex_unlock(&ksm_thread_mutex);
+ break;
+
+ case MEM_OFFLINE:
+ /*
+ * Most of the work is done by page migration; but there might
+ * be a few stable_nodes left over, still pointing to struct
+ * pages which have been offlined: prune those from the tree,
+ * otherwise get_ksm_page() might later try to access a
+ * non-existent struct page.
+ */
+ ksm_check_stable_tree(mn->start_pfn,
+ mn->start_pfn + mn->nr_pages);
+ /* fallthrough */
+
+ case MEM_CANCEL_OFFLINE:
+ mutex_lock(&ksm_thread_mutex);
+ ksm_run &= ~KSM_RUN_OFFLINE;
+ mutex_unlock(&ksm_thread_mutex);
+
+ smp_mb(); /* wake_up_bit advises this */
+ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); /* release wait_while_offlining() sleepers */
+ break;
+ }
+ return NOTIFY_OK; /* returned for every action, handled or not */
+}
+#else
+static void wait_while_offlining(void)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */
+
+#define KSM_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define KSM_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = \
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+}
+
+static ssize_t sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = kstrtoul(buf, 10, &msecs); /* decimal input only */
+ if (err || msecs > UINT_MAX)
+ return -EINVAL; /* parse errors and oversized values alike */
+
+ ksm_thread_sleep_millisecs = msecs;
+
+ return count;
+}
+KSM_ATTR(sleep_millisecs);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+}
+
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long nr_pages;
+
+ err = kstrtoul(buf, 10, &nr_pages); /* decimal input only */
+ if (err || nr_pages > UINT_MAX)
+ return -EINVAL; /* parse errors and oversized values alike */
+
+ ksm_thread_pages_to_scan = nr_pages;
+
+ return count;
+}
+KSM_ATTR(pages_to_scan);
+
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_run);
+}
+
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long flags;
+
+ err = kstrtoul(buf, 10, &flags);
+ if (err || flags > UINT_MAX)
+ return -EINVAL;
+ if (flags > KSM_RUN_UNMERGE)
+ return -EINVAL; /* nothing above the UNMERGE value may be written */
+
+ /*
+ * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
+ * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
+ * breaking COW to free the pages_shared (but leaves mm_slots
+ * on the list for when ksmd may be set running again).
+ */
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_run != flags) {
+ ksm_run = flags;
+ if (flags & KSM_RUN_UNMERGE) {
+ set_current_oom_origin();
+ err = unmerge_and_remove_all_rmap_items();
+ clear_current_oom_origin();
+ if (err) {
+ ksm_run = KSM_RUN_STOP; /* unmerge failed: leave ksm stopped */
+ count = err; /* and hand the error to the writer */
+ }
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ if (flags & KSM_RUN_MERGE)
+ wake_up_interruptible(&ksm_thread_wait);
+
+ return count;
+}
+KSM_ATTR(run);
+
+#ifdef CONFIG_NUMA
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long knob;
+
+ err = kstrtoul(buf, 10, &knob);
+ if (err)
+ return err;
+ if (knob > 1)
+ return -EINVAL; /* only 0 and 1 are accepted */
+
+ mutex_lock(&ksm_thread_mutex);
+ wait_while_offlining();
+ if (ksm_merge_across_nodes != knob) {
+ if (ksm_pages_shared || remove_all_stable_nodes())
+ err = -EBUSY; /* refused while pages are still shared */
+ else if (root_stable_tree == one_stable_tree) {
+ struct rb_root *buf; /* note: shadows the 'buf' parameter */
+ /*
+ * This is the first time that we switch away from the
+ * default of merging across nodes: must now allocate
+ * a buffer to hold as many roots as may be needed.
+ * Allocate stable and unstable together:
+ * MAXSMP NODES_SHIFT 10 will use 16kB.
+ */
+ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+ GFP_KERNEL);
+ /* Assume RB_ROOT is all zero bytes, as the zeroed kcalloc() buffer is */
+ if (!buf)
+ err = -ENOMEM;
+ else {
+ root_stable_tree = buf;
+ root_unstable_tree = buf + nr_node_ids; /* second half of the array */
+ /* Stable tree is empty but not the unstable */
+ root_unstable_tree[0] = one_unstable_tree[0];
+ }
+ }
+ if (!err) {
+ ksm_merge_across_nodes = knob;
+ ksm_nr_node_ids = knob ? 1 : nr_node_ids;
+ }
+ }
+ mutex_unlock(&ksm_thread_mutex);
+
+ return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
+static ssize_t pages_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_shared);
+}
+KSM_ATTR_RO(pages_shared);
+
+static ssize_t pages_sharing_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_sharing);
+}
+KSM_ATTR_RO(pages_sharing);
+
+static ssize_t pages_unshared_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_pages_unshared);
+}
+KSM_ATTR_RO(pages_unshared);
+
+static ssize_t pages_volatile_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ long ksm_pages_volatile;
+
+ ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
+ - ksm_pages_sharing - ksm_pages_unshared;
+ /*
+ * It was not worth any locking to calculate that statistic,
+ * but it might therefore sometimes be negative: conceal that.
+ */
+ if (ksm_pages_volatile < 0)
+ ksm_pages_volatile = 0;
+ return sprintf(buf, "%ld\n", ksm_pages_volatile);
+}
+KSM_ATTR_RO(pages_volatile);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+}
+KSM_ATTR_RO(full_scans);
+
+static struct attribute *ksm_attrs[] = {
+ &sleep_millisecs_attr.attr,
+ &pages_to_scan_attr.attr,
+ &run_attr.attr,
+ &pages_shared_attr.attr,
+ &pages_sharing_attr.attr,
+ &pages_unshared_attr.attr,
+ &pages_volatile_attr.attr,
+ &full_scans_attr.attr,
+#ifdef CONFIG_NUMA
+ &merge_across_nodes_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group ksm_attr_group = {
+ .attrs = ksm_attrs,
+ .name = "ksm",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init ksm_init(void)
+{
+ struct task_struct *ksm_thread;
+ int err;
+
+ err = ksm_slab_init();
+ if (err)
+ goto out;
+
+ ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
+ if (IS_ERR(ksm_thread)) {
+ printk(KERN_ERR "ksm: creating kthread failed\n");
+ err = PTR_ERR(ksm_thread);
+ goto out_free;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = sysfs_create_group(mm_kobj, &ksm_attr_group);
+ if (err) {
+ printk(KERN_ERR "ksm: register sysfs failed\n");
+ kthread_stop(ksm_thread); /* undo kthread_run() before unwinding */
+ goto out_free;
+ }
+#else
+ ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
+
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ /* There is no significance to this priority 100 */
+ hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
+ return 0;
+
+out_free:
+ ksm_slab_free(); /* undo ksm_slab_init() */
+out:
+ return err;
+}
+subsys_initcall(ksm_init);
diff --git a/mm/list_lru.c b/mm/list_lru.c
new file mode 100644
index 00000000000..f1a0db19417
--- /dev/null
+++ b/mm/list_lru.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
+ * Authors: David Chinner and Glauber Costa
+ *
+ * Generic LRU infrastructure
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/list_lru.h>
+#include <linux/slab.h>
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item)); /* node of the memory holding item */
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ if (list_empty(item)) { /* item is not linked on any list */
+ list_add_tail(item, &nlru->list);
+ if (nlru->nr_items++ == 0) /* count went 0 -> 1 */
+ node_set(nid, lru->active_nodes);
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false; /* item was already linked: nothing changed */
+}
+EXPORT_SYMBOL_GPL(list_lru_add);
+
+bool list_lru_del(struct list_lru *lru, struct list_head *item)
+{
+ int nid = page_to_nid(virt_to_page(item)); /* node of the memory holding item */
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ if (!list_empty(item)) { /* item is linked on a list */
+ list_del_init(item);
+ if (--nlru->nr_items == 0) /* that was the node's last item */
+ node_clear(nid, lru->active_nodes);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ spin_unlock(&nlru->lock);
+ return true;
+ }
+ spin_unlock(&nlru->lock);
+ return false; /* item was not linked: nothing changed */
+}
+EXPORT_SYMBOL_GPL(list_lru_del);
+
+unsigned long
+list_lru_count_node(struct list_lru *lru, int nid)
+{
+ unsigned long count = 0;
+ struct list_lru_node *nlru = &lru->node[nid];
+
+ spin_lock(&nlru->lock);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ count += nlru->nr_items; /* sampled under the node lock */
+ spin_unlock(&nlru->lock);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(list_lru_count_node);
+
+unsigned long
+list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
+ void *cb_arg, unsigned long *nr_to_walk)
+{
+
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_head *item, *n;
+ unsigned long isolated = 0;
+
+ spin_lock(&nlru->lock);
+restart:
+ list_for_each_safe(item, n, &nlru->list) {
+ enum lru_status ret;
+
+ /*
+ * decrement nr_to_walk first so that we don't livelock if we
+ * get stuck on large numbers of LRU_RETRY items
+ */
+ if (!*nr_to_walk)
+ break;
+ --*nr_to_walk;
+
+ ret = isolate(item, &nlru->lock, cb_arg);
+ switch (ret) {
+ case LRU_REMOVED_RETRY:
+ assert_spin_locked(&nlru->lock); /* fallthrough */
+ case LRU_REMOVED:
+ if (--nlru->nr_items == 0)
+ node_clear(nid, lru->active_nodes);
+ WARN_ON_ONCE(nlru->nr_items < 0);
+ isolated++;
+ /*
+ * If the lru lock has been dropped, our list
+ * traversal is now invalid and so we have to
+ * restart from scratch.
+ */
+ if (ret == LRU_REMOVED_RETRY)
+ goto restart;
+ break;
+ case LRU_ROTATE:
+ list_move_tail(item, &nlru->list);
+ break;
+ case LRU_SKIP:
+ break;
+ case LRU_RETRY:
+ /*
+ * The lru lock has been dropped, our list traversal is
+ * now invalid and so we have to restart from scratch.
+ */
+ assert_spin_locked(&nlru->lock);
+ goto restart;
+ default:
+ BUG(); /* the callback returned an unknown status */
+ }
+ }
+
+ spin_unlock(&nlru->lock);
+ return isolated; /* number of items the callback removed */
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_node);
+
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+{
+ int i;
+ size_t size = sizeof(*lru->node) * nr_node_ids; /* one entry per node id */
+
+ lru->node = kzalloc(size, GFP_KERNEL);
+ if (!lru->node)
+ return -ENOMEM;
+
+ nodes_clear(lru->active_nodes);
+ for (i = 0; i < nr_node_ids; i++) {
+ spin_lock_init(&lru->node[i].lock);
+ if (key) /* key may be NULL: then no lock class is set here */
+ lockdep_set_class(&lru->node[i].lock, key);
+ INIT_LIST_HEAD(&lru->node[i].list);
+ lru->node[i].nr_items = 0;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(list_lru_init_key);
+
+void list_lru_destroy(struct list_lru *lru)
+{
+ kfree(lru->node); /* the per-node array allocated by list_lru_init_key() */
+}
+EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/maccess.c b/mm/maccess.c
new file mode 100644
index 00000000000..d53adf9ba84
--- /dev/null
+++ b/mm/maccess.c
@@ -0,0 +1,62 @@
+/*
+ * Access kernel memory without faulting.
+ */
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+/**
+ * probe_kernel_read(): safely attempt to read from a location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from
+ * @size: size of the data chunk
+ *
+ * Safely read from address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_kernel_read(void *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_kernel_read")));
+
+long __probe_kernel_read(void *dst, const void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs(); /* saved so it can be restored below */
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst,
+ (__force const void __user *)src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0; /* any nonzero copy result becomes -EFAULT */
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+/**
+ * probe_kernel_write(): safely attempt to write to a location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long __weak probe_kernel_write(void *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_kernel_write")));
+
+long __probe_kernel_write(void *dst, const void *src, size_t size)
+{
+ long ret;
+ mm_segment_t old_fs = get_fs(); /* saved so it can be restored below */
+
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret ? -EFAULT : 0; /* any nonzero copy result becomes -EFAULT */
+}
+EXPORT_SYMBOL_GPL(probe_kernel_write);
diff --git a/mm/madvise.c b/mm/madvise.c
index 4e196155a0c..a402f8fdc68 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,20 +9,47 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
+#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
+#include <linux/falloc.h>
+#include <linux/sched.h>
+#include <linux/ksm.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_sem for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static int madvise_need_mmap_write(int behavior)
+{
+ switch (behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_DONTNEED:
+ return 0; /* these three are the explicit read-lock exceptions */
+ default:
+ /* be safe, default to 1. list exceptions explicitly */
+ return 1;
+ }
+}
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
-static long madvise_behavior(struct vm_area_struct * vma,
+static long madvise_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
- struct mm_struct * mm = vma->vm_mm;
+ struct mm_struct *mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags;
+ unsigned long new_flags = vma->vm_flags;
switch (behavior) {
case MADV_NORMAL:
@@ -38,8 +65,34 @@ static long madvise_behavior(struct vm_area_struct * vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ goto out;
+ }
new_flags &= ~VM_DONTCOPY;
break;
+ case MADV_DONTDUMP:
+ new_flags |= VM_DONTDUMP;
+ break;
+ case MADV_DODUMP:
+ if (new_flags & VM_SPECIAL) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags &= ~VM_DONTDUMP;
+ break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out;
+ break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -81,19 +134,109 @@ out:
return error;
}
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
+{
+ pte_t *orig_pte;
+ struct vm_area_struct *vma = walk->private; /* the vma is handed in through the walker's private pointer */
+ unsigned long index;
+
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ return 0; /* nothing to do for this pmd; 0 lets the walk continue */
+
+ for (index = start; index != end; index += PAGE_SIZE) { /* one iteration per page of [start, end) */
+ pte_t pte;
+ swp_entry_t entry;
+ struct page *page;
+ spinlock_t *ptl;
+
+ orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ pte = *(orig_pte + ((index - start) / PAGE_SIZE)); /* copy the pte for @index while the pte lock is held */
+ pte_unmap_unlock(orig_pte, ptl); /* the lock is dropped before any swap read is issued below */
+
+ if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+ continue; /* only the remaining (swap-style) ptes are of interest */
+ entry = pte_to_swp_entry(pte);
+ if (unlikely(non_swap_entry(entry)))
+ continue;
+
+ page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+ vma, index);
+ if (page)
+ page_cache_release(page); /* the page itself is not needed here, only the read request */
+ }
+
+ return 0; /* always 0: failures to read a page are not reported */
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_walk walk = {
+ .mm = vma->vm_mm,
+ .pmd_entry = swapin_walk_pmd_entry, /* invoked by the page walker for each pmd in the range */
+ .private = vma, /* read back as walk->private by the callback */
+ };
+
+ walk_page_range(start, end, &walk); /* return value is ignored: this is best-effort */
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct address_space *mapping)
+{
+ pgoff_t index;
+ struct page *page;
+ swp_entry_t swap;
+
+ for (; start < end; start += PAGE_SIZE) { /* one iteration per page of [start, end) */
+ index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; /* virtual address -> page offset within @mapping */
+
+ page = find_get_entry(mapping, index);
+ if (!radix_tree_exceptional_entry(page)) { /* not an exceptional entry: nothing to read in */
+ if (page)
+ page_cache_release(page); /* drop the page obtained from the lookup */
+ continue;
+ }
+ swap = radix_to_swp_entry(page); /* the exceptional entry encodes a swap entry */
+ page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+ NULL, 0);
+ if (page)
+ page_cache_release(page);
+ }
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+}
+#endif /* CONFIG_SWAP */
+
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct * vma,
- struct vm_area_struct ** prev,
+static long madvise_willneed(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
+#ifdef CONFIG_SWAP
+ if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+ *prev = vma;
+ if (!file)
+ force_swapin_readahead(vma, start, end);
+ else
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+#endif
+
if (!file)
return -EBADF;
- if (file->f_mapping->a_ops->get_xip_page) {
+ if (file->f_mapping->a_ops->get_xip_mem) {
/* no bad return value, but ignore advice */
return 0;
}
@@ -104,8 +247,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
end = vma->vm_end;
end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- force_page_cache_readahead(file->f_mapping,
- file, start, max_sane_readahead(end - start));
+ force_page_cache_readahead(file->f_mapping, file, start, end - start);
return 0;
}
@@ -113,10 +255,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
+ * zap_page_range call sets things up for shrink_active_list to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
+ * shrink_active_list to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
@@ -128,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed(struct vm_area_struct * vma,
- struct vm_area_struct ** prev,
+static long madvise_dontneed(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
*prev = vma;
@@ -155,66 +297,127 @@ static long madvise_dontneed(struct vm_area_struct * vma,
* Other filesystems return -ENOSYS.
*/
static long madvise_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
- struct address_space *mapping;
- loff_t offset, endoff;
+ loff_t offset;
+ int error;
+ struct file *f;
+
+ *prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
- if (!vma->vm_file || !vma->vm_file->f_mapping
- || !vma->vm_file->f_mapping->host) {
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
return -EACCES;
- mapping = vma->vm_file->f_mapping;
-
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- endoff = (loff_t)(end - vma->vm_start - 1)
- + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- return vmtruncate_range(mapping->host, offset, endoff);
+
+ /*
+ * Filesystem's fallocate may need to take i_mutex. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ get_file(f);
+ up_read(&current->mm->mmap_sem);
+ error = do_fallocate(f,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - start);
+ fput(f);
+ down_read(&current->mm->mmap_sem);
+ return error;
}
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Error injection support for memory error handling.
+ *
+ * Walks [@start, @end) and, for each page obtained through
+ * get_user_pages_fast(), either soft-offlines it (@bhv == MADV_SOFT_OFFLINE)
+ * or injects a memory failure for it. Pages already marked HWPoison are
+ * skipped. The caller must have CAP_SYS_ADMIN, otherwise -EPERM is returned.
+ *
+ * Returns 0 when the whole range was processed, or the value that stopped
+ * the loop (a get_user_pages_fast() result other than 1, or a non-zero
+ * soft_offline_page() result).
+ *
+ * NOTE(review): when get_user_pages_fast() returns 0 the function returns 0,
+ * which the caller cannot tell apart from success - confirm this is intended.
+ */
+static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
+{
+ struct page *p;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ for (; start < end; start += PAGE_SIZE <<
+ compound_order(compound_head(p))) { /* step over the whole compound page that @p belongs to */
+ int ret;
+
+ ret = get_user_pages_fast(start, 1, 0, &p);
+ if (ret != 1)
+ return ret;
+
+ if (PageHWPoison(p)) {
+ put_page(p); /* already poisoned: release the page taken above and move on */
+ continue;
+ }
+ if (bhv == MADV_SOFT_OFFLINE) {
+ pr_info("Soft offlining page %#lx at %#lx\n",
+ page_to_pfn(p), start);
+ ret = soft_offline_page(p, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
+ continue;
+ }
+ pr_info("Injecting memory failure for page %#lx at %#lx\n",
+ page_to_pfn(p), start);
+ /* Ignore return value for now */
+ memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ }
+ return 0;
+}
+#endif
+
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
- long error;
+ switch (behavior) {
+ case MADV_REMOVE:
+ return madvise_remove(vma, prev, start, end);
+ case MADV_WILLNEED:
+ return madvise_willneed(vma, prev, start, end);
+ case MADV_DONTNEED:
+ return madvise_dontneed(vma, prev, start, end);
+ default:
+ return madvise_behavior(vma, prev, start, end, behavior);
+ }
+}
+static int
+madvise_behavior_valid(int behavior)
+{
switch (behavior) {
case MADV_DOFORK:
- if (vma->vm_flags & VM_IO) {
- error = -EINVAL;
- break;
- }
case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
- error = madvise_behavior(vma, prev, start, end, behavior);
- break;
case MADV_REMOVE:
- error = madvise_remove(vma, start, end);
- break;
-
case MADV_WILLNEED:
- error = madvise_willneed(vma, prev, start, end);
- break;
-
case MADV_DONTNEED:
- error = madvise_dontneed(vma, prev, start, end);
- break;
+#ifdef CONFIG_KSM
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
+ case MADV_DONTDUMP:
+ case MADV_DODUMP:
+ return 1;
default:
- error = -EINVAL;
- break;
+ return 0;
}
- return error;
}
/*
@@ -241,6 +444,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* so the kernel can free resources associated with it.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
+ * MADV_DONTFORK - omit this area from child's address space when forking:
+ * typically, to avoid COWing pages pinned by get_user_pages().
+ * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
+ * this area with pages of identical content from other such areas.
+ * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
*
* return values:
* zero - success
@@ -253,31 +462,44 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
unsigned long end, tmp;
- struct vm_area_struct * vma, *prev;
+ struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
+ int write;
size_t len;
+ struct blk_plug plug;
- down_write(&current->mm->mmap_sem);
+#ifdef CONFIG_MEMORY_FAILURE
+ if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+ return madvise_hwpoison(behavior, start, start+len_in);
+#endif
+ if (!madvise_behavior_valid(behavior))
+ return error;
if (start & ~PAGE_MASK)
- goto out;
+ return error;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- goto out;
+ return error;
end = start + len;
if (end < start)
- goto out;
+ return error;
error = 0;
if (end == start)
- goto out;
+ return error;
+
+ write = madvise_need_mmap_write(behavior);
+ if (write)
+ down_write(&current->mm->mmap_sem);
+ else
+ down_read(&current->mm->mmap_sem);
/*
* If the interval [start,end) covers some unmapped address
@@ -288,6 +510,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
if (vma && start > vma->vm_start)
prev = vma;
+ blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
@@ -312,14 +535,22 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
if (error)
goto out;
start = tmp;
- if (start < prev->vm_end)
+ if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out;
- vma = prev->vm_next;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
}
out:
- up_write(&current->mm->mmap_sem);
+ blk_finish_plug(&plug);
+ if (write)
+ up_write(&current->mm->mmap_sem);
+ else
+ up_read(&current->mm->mmap_sem);
+
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
new file mode 100644
index 00000000000..6d2f219a48b
--- /dev/null
+++ b/mm/memblock.c
@@ -0,0 +1,1591 @@
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/memblock.h>
+
+#include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
+
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+#endif
+
+struct memblock memblock __initdata_memblock = {
+ .memory.regions = memblock_memory_init_regions,
+ .memory.cnt = 1, /* empty dummy entry */
+ .memory.max = INIT_MEMBLOCK_REGIONS,
+
+ .reserved.regions = memblock_reserved_init_regions,
+ .reserved.cnt = 1, /* empty dummy entry */
+ .reserved.max = INIT_MEMBLOCK_REGIONS,
+
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ .physmem.regions = memblock_physmem_init_regions,
+ .physmem.cnt = 1, /* empty dummy entry */
+ .physmem.max = INIT_PHYSMEM_REGIONS,
+#endif
+
+ .bottom_up = false,
+ .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
+};
+
+int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
+static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
+
+/*
+ * memblock_type_name - printable name for a memblock type
+ * @type: pointer compared against &memblock.memory and &memblock.reserved
+ *
+ * Returns "memory" or "reserved" for the two matching types and "unknown"
+ * for any other pointer.
+ *
+ * inline so we don't get a warning when pr_debug is compiled out
+ * NOTE(review): no `inline` keyword is visible on the definition below;
+ * confirm whether __init_memblock is expected to provide it.
+ */
+static __init_memblock const char *
+memblock_type_name(struct memblock_type *type)
+{
+ if (type == &memblock.memory)
+ return "memory";
+ else if (type == &memblock.reserved)
+ return "reserved";
+ else
+ return "unknown";
+}
+
+/*
+ * adjust *@size so that (@base + *@size) doesn't overflow, return new size
+ *
+ * The size is capped at (phys_addr_t)ULLONG_MAX - @base; the capped value is
+ * both written back through @size and returned.
+ */
+static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
+{
+ return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
+}
+
+/*
+ * Address comparison utilities
+ *
+ * memblock_addrs_overlap - test whether [@base1, @base1 + @size1) and
+ * [@base2, @base2 + @size2) intersect. Returns non-zero when each range
+ * starts below the end of the other, 0 otherwise.
+ */
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i; /* index of the region currently being tested */
+
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break; /* first overlapping region found; stop scanning */
+ }
+
+ return (i < type->cnt) ? i : -1; /* index of the first overlap, or -1 when no region of @type overlaps [base, base + size) */
+}
+
+/**
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * Each free range is clamped to [@start, @end]; the first one that can hold
+ * @size bytes starting at its lowest @align-aligned address wins.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end); /* restrict the free range to the caller's window */
+ this_end = clamp(this_end, start, end);
+
+ cand = round_up(this_start, align); /* lowest aligned address inside the clamped range */
+ if (cand < this_end && this_end - cand >= size)
+ return cand;
+ }
+
+ return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * Free ranges are visited in reverse order and clamped to [@start, @end];
+ * the candidate is the highest @align-aligned address that leaves @size
+ * bytes below the clamped end.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end); /* restrict the free range to the caller's window */
+ this_end = clamp(this_end, start, end);
+
+ if (this_end < size)
+ continue; /* this_end - size below would wrap around */
+
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand; /* aligned candidate still lies inside the clamped range */
+ }
+
+ return 0;
+}
+
+/**
+ * memblock_find_in_range_node - find free area in given range and node
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Find @size free area aligned to @align in the specified range and node.
+ *
+ * When allocation direction is bottom-up, the @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want the bottom-up allocation just near the kernel
+ * image so it is highly likely that the allocated memory and the kernel
+ * will reside in the same node.
+ *
+ * If bottom-up allocation failed, will try to allocate memory top-down.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
+{
+ /*
+ * @ret holds a physical address returned by the bottom-up helper, so
+ * it must be phys_addr_t: an int would truncate the address and could
+ * turn a valid result into 0 ("not found").
+ */
+ phys_addr_t kernel_end, ret;
+
+ /* pump up @end */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
+
+ /* avoid allocating the first page */
+ start = max_t(phys_addr_t, start, PAGE_SIZE);
+ end = max(start, end);
+ kernel_end = __pa_symbol(_end);
+
+ /*
+ * try bottom-up allocation only when bottom-up mode
+ * is set and @end is above the kernel image.
+ */
+ if (memblock_bottom_up() && end > kernel_end) {
+ phys_addr_t bottom_up_start;
+
+ /* make sure we will allocate above the kernel */
+ bottom_up_start = max(start, kernel_end);
+
+ /* ok, try bottom-up allocation first */
+ ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+ size, align, nid);
+ if (ret)
+ return ret;
+
+ /*
+ * we always limit bottom-up allocation above the kernel,
+ * but top-down allocation doesn't have the limit, so
+ * retrying top-down allocation may succeed when bottom-up
+ * allocation failed.
+ *
+ * bottom-up allocation is expected to fail very rarely,
+ * so we use WARN_ONCE() here to see the stack trace if
+ * fail happens.
+ */
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+ "memory hotunplug may be affected\n");
+ }
+
+ return __memblock_find_range_top_down(start, end, size, align, nid);
+}
+
+/**
+ * memblock_find_in_range - find free area in given range
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ *
+ * Find @size free area aligned to @align in the specified range.
+ *
+ * Thin wrapper around memblock_find_in_range_node() with %NUMA_NO_NODE as
+ * the node; note that the argument order differs between the two functions.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+ phys_addr_t end, phys_addr_t size,
+ phys_addr_t align)
+{
+ return memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE);
+}
+
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
+{
+ type->total_size -= type->regions[r].size; /* region @r no longer counts toward the total */
+ memmove(&type->regions[r], &type->regions[r + 1],
+ (type->cnt - (r + 1)) * sizeof(type->regions[r])); /* slide the entries after @r down by one slot */
+ type->cnt--;
+
+ /* Special case for empty arrays */
+ if (type->cnt == 0) {
+ WARN_ON(type->total_size != 0);
+ type->cnt = 1; /* keep one zero-sized placeholder entry rather than an empty array */
+ type->regions[0].base = 0;
+ type->regions[0].size = 0;
+ type->regions[0].flags = 0;
+ memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
+ }
+}
+
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0; /* still the static initial array: report size 0 and leave *addr untouched */
+
+ *addr = __pa(memblock.reserved.regions); /* physical address of the current reserved array */
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max); /* page-aligned byte size of that array */
+}
+
+phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.memory.regions == memblock_memory_init_regions)
+ return 0; /* still the static initial array: report size 0 and leave *addr untouched */
+
+ *addr = __pa(memblock.memory.regions); /* physical address of the current memory array */
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max); /* page-aligned byte size of that array */
+}
+
+#endif
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+ phys_addr_t new_area_start,
+ phys_addr_t new_area_size)
+{
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_alloc_size, new_alloc_size;
+ phys_addr_t old_size, new_size, addr;
+ int use_slab = slab_is_available(); /* sampled once; decides between kmalloc and memblock below */
+ int *in_slab;
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
+ */
+ if (!memblock_can_resize)
+ return -1;
+
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+ /*
+ * We need to allocate the new one aligned to PAGE_SIZE,
+ * so we can free them completely later.
+ */
+ old_alloc_size = PAGE_ALIGN(old_size);
+ new_alloc_size = PAGE_ALIGN(new_size);
+
+ /* Retrieve the slab flag */
+ if (type == &memblock.memory)
+ in_slab = &memblock_memory_in_slab;
+ else /* every type other than memory shares the reserved flag */
+ in_slab = &memblock_reserved_in_slab;
+
+ /* Try to find some space for it.
+ *
+ * WARNING: We assume that either slab_is_available() and we use it or
+ * we use MEMBLOCK for allocations. That means that this is unsafe to
+ * use when bootmem is currently active (unless bootmem itself is
+ * implemented on top of MEMBLOCK which isn't the case yet)
+ *
+ * This should however not be an issue for now, as we currently only
+ * call into MEMBLOCK while it's still active, or much later when slab
+ * is active for memory hotplug operations
+ */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array ? __pa(new_array) : 0; /* addr == 0 doubles as the failure marker checked below */
+ } else {
+ /* only exclude range when trying to double reserved.regions */
+ if (type != &memblock.reserved)
+ new_area_start = new_area_size = 0;
+
+ addr = memblock_find_in_range(new_area_start + new_area_size,
+ memblock.current_limit,
+ new_alloc_size, PAGE_SIZE); /* first look above the range to avoid */
+ if (!addr && new_area_size)
+ addr = memblock_find_in_range(0,
+ min(new_area_start, memblock.current_limit),
+ new_alloc_size, PAGE_SIZE); /* then fall back to the space below it */
+
+ new_array = addr ? __va(addr) : NULL;
+ }
+ if (!addr) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ memblock_type_name(type), type->max, type->max * 2);
+ return -1;
+ }
+
+ memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr,
+ (u64)addr + new_size - 1);
+
+ /*
+ * Found space, we now need to move the array over before we add the
+ * reserved region since it may be our reserved array itself that is
+ * full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size); /* zero the newly added second half */
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* Free old array. We needn't free it if the array is the static one */
+ if (*in_slab)
+ kfree(old_array);
+ else if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_alloc_size);
+
+ /*
+ * Reserve the new array if that comes from the memblock. Otherwise, we
+ * needn't do it
+ */
+ if (!use_slab)
+ BUG_ON(memblock_reserve(addr, new_alloc_size));
+
+ /* Update slab flag */
+ *in_slab = use_slab;
+
+ return 0;
+}
+
+/**
+ * memblock_merge_regions - merge neighboring compatible regions
+ * @type: memblock type to scan
+ *
+ * Scan @type and merge neighboring compatible regions.
+ *
+ * Two adjacent entries are merged when the first ends exactly where the
+ * second begins and both carry the same node id and flags. Adjacent entries
+ * that are not merged must not overlap; that is enforced with BUG_ON().
+ */
+static void __init_memblock memblock_merge_regions(struct memblock_type *type)
+{
+ int i = 0;
+
+ /* cnt never goes below 1 */
+ while (i < type->cnt - 1) {
+ struct memblock_region *this = &type->regions[i];
+ struct memblock_region *next = &type->regions[i + 1];
+
+ if (this->base + this->size != next->base ||
+ memblock_get_region_node(this) !=
+ memblock_get_region_node(next) ||
+ this->flags != next->flags) {
+ BUG_ON(this->base + this->size > next->base);
+ i++;
+ continue;
+ }
+
+ this->size += next->size; /* absorb @next into @this */
+ /* move forward from next + 1, index of which is i + 2 */
+ memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
+ type->cnt--; /* i is not advanced, so the grown entry is compared with its new neighbour */
+ }
+}
+
+/**
+ * memblock_insert_region - insert new memblock region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: node id of the new region
+ * @flags: flags of the new region
+ *
+ * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * @type must already have extra room to accommodate the new region.
+ */
+static void __init_memblock memblock_insert_region(struct memblock_type *type,
+ int idx, phys_addr_t base,
+ phys_addr_t size,
+ int nid, unsigned long flags)
+{
+ struct memblock_region *rgn = &type->regions[idx];
+
+ BUG_ON(type->cnt >= type->max); /* the caller must have grown the array beforehand */
+ memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); /* shift entries from @idx up by one slot */
+ rgn->base = base;
+ rgn->size = size;
+ rgn->flags = flags;
+ memblock_set_region_node(rgn, nid);
+ type->cnt++;
+ type->total_size += size;
+}
+
+/**
+ * memblock_add_range - add new memblock region
+ * @type: memblock type to add new region into
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ * @flags: flags of the new region
+ *
+ * Add new memblock region [@base,@base+@size) into @type. The new region
+ * is allowed to overlap with existing ones - overlaps don't affect already
+ * existing regions. @type is guaranteed to be minimal (all neighbouring
+ * compatible regions are merged) after the addition.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_add_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int nid, unsigned long flags)
+{
+ bool insert = false;
+ phys_addr_t obase = base; /* original base; @base is advanced during each pass */
+ phys_addr_t end = base + memblock_cap_size(base, &size); /* @size is capped so that @end cannot overflow */
+ int i, nr_new;
+
+ if (!size)
+ return 0;
+
+ /* special case for empty array */
+ if (type->regions[0].size == 0) {
+ WARN_ON(type->cnt != 1 || type->total_size);
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ type->regions[0].flags = flags;
+ memblock_set_region_node(&type->regions[0], nid);
+ type->total_size = size;
+ return 0;
+ }
+repeat:
+ /*
+ * The following is executed twice. Once with %false @insert and
+ * then with %true. The first counts the number of regions needed
+ * to accommodate the new area. The second actually inserts them.
+ */
+ base = obase;
+ nr_new = 0;
+
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
+
+ if (rbase >= end)
+ break; /* this region starts at or past the end of the new area */
+ if (rend <= base)
+ continue; /* this region lies entirely below what is left to add */
+ /*
+ * @rgn overlaps. If it separates the lower part of new
+ * area, insert that portion.
+ */
+ if (rbase > base) {
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, i++, base,
+ rbase - base, nid,
+ flags);
+ }
+ /* area below @rend is dealt with, forget about it */
+ base = min(rend, end);
+ }
+
+ /* insert the remaining portion */
+ if (base < end) {
+ nr_new++;
+ if (insert)
+ memblock_insert_region(type, i, base, end - base,
+ nid, flags);
+ }
+
+ /*
+ * If this was the first round, resize array and repeat for actual
+ * insertions; otherwise, merge and return.
+ */
+ if (!insert) {
+ while (type->cnt + nr_new > type->max)
+ if (memblock_double_array(type, obase, size) < 0)
+ return -ENOMEM;
+ insert = true;
+ goto repeat;
+ } else {
+ memblock_merge_regions(type);
+ return 0;
+ }
+}
+
+int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
+ int nid)
+{
+ return memblock_add_range(&memblock.memory, base, size, nid, 0); /* add to the memory type on node @nid, with no flags */
+}
+
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_add_range(&memblock.memory, base, size,
+ MAX_NUMNODES, 0); /* add to the memory type with MAX_NUMNODES as the node id and no flags */
+}
+
+/**
+ * memblock_isolate_range - isolate given range into disjoint memblocks
+ * @type: memblock type to isolate range for
+ * @base: base of range to isolate
+ * @size: size of range to isolate
+ * @start_rgn: out parameter for the start of isolated region
+ * @end_rgn: out parameter for the end of isolated region
+ *
+ * Walk @type and ensure that regions don't cross the boundaries defined by
+ * [@base,@base+@size). Crossing regions are split at the boundaries,
+ * which may create at most two more regions. The index of the first
+ * region inside the range is returned in *@start_rgn and end in *@end_rgn.
+ *
+ * Both out parameters are set to 0 when @size is 0 or when no region lies
+ * inside the range; *@end_rgn is one past the last contained region.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_isolate_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int *start_rgn, int *end_rgn)
+{
+ phys_addr_t end = base + memblock_cap_size(base, &size); /* @size is capped so that @end cannot overflow */
+ int i;
+
+ *start_rgn = *end_rgn = 0;
+
+ if (!size)
+ return 0;
+
+ /* we'll create at most two more regions */
+ while (type->cnt + 2 > type->max)
+ if (memblock_double_array(type, base, size) < 0)
+ return -ENOMEM;
+
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rbase = rgn->base;
+ phys_addr_t rend = rbase + rgn->size;
+
+ if (rbase >= end)
+ break;
+ if (rend <= base)
+ continue;
+
+ if (rbase < base) {
+ /*
+ * @rgn intersects from below. Split and continue
+ * to process the next region - the new top half.
+ */
+ rgn->base = base;
+ rgn->size -= base - rbase;
+ type->total_size -= base - rbase; /* memblock_insert_region() adds the same amount back */
+ memblock_insert_region(type, i, rbase, base - rbase,
+ memblock_get_region_node(rgn),
+ rgn->flags);
+ } else if (rend > end) {
+ /*
+ * @rgn intersects from above. Split and redo the
+ * current region - the new bottom half.
+ */
+ rgn->base = end;
+ rgn->size -= end - rbase;
+ type->total_size -= end - rbase; /* memblock_insert_region() adds the same amount back */
+ memblock_insert_region(type, i--, rbase, end - rbase,
+ memblock_get_region_node(rgn),
+ rgn->flags); /* i-- makes the loop revisit index i, now the bottom half */
+ } else {
+ /* @rgn is fully contained, record it */
+ if (!*end_rgn)
+ *start_rgn = i; /* only the first contained region sets the start index */
+ *end_rgn = i + 1;
+ }
+ }
+
+ return 0;
+}
+
+int __init_memblock memblock_remove_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
+{
+ int start_rgn, end_rgn;
+ int i, ret;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); /* split regions so [base, base + size) is covered by whole entries */
+ if (ret)
+ return ret;
+
+ for (i = end_rgn - 1; i >= start_rgn; i--) /* remove from the highest index down so lower indexes stay valid */
+ memblock_remove_region(type, i);
+ return 0;
+}
+
+int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_remove_range(&memblock.memory, base, size); /* operates on the memory type only */
+}
+
+
+int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+{
+ memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
+ (unsigned long long)base,
+ (unsigned long long)base + size - 1,
+ (void *)_RET_IP_); /* logs the inclusive range and the caller's return address */
+
+ kmemleak_free_part(__va(base), size);
+ return memblock_remove_range(&memblock.reserved, base, size); /* freeing means dropping the range from the reserved type */
+}
+
+/*
+ * Add [@base, @base + @size) to memblock.reserved with node @nid and
+ * region @flags, logging the request (and the caller's return address)
+ * through memblock_dbg() first.
+ */
+static int __init_memblock memblock_reserve_region(phys_addr_t base,
+						   phys_addr_t size,
+						   int nid,
+						   unsigned long flags)
+{
+	unsigned long long first = (unsigned long long)base;
+	unsigned long long last = first + size - 1;
+
+	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
+		     first, last, flags, (void *)_RET_IP_);
+
+	return memblock_add_range(&memblock.reserved, base, size, nid, flags);
+}
+
+/* Reserve [@base, @base + @size) with nid MAX_NUMNODES and no flags. */
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+	const int nid = MAX_NUMNODES;
+	const unsigned long flags = 0;
+
+	return memblock_reserve_region(base, size, nid, flags);
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Isolates [@base, @base + @size) inside memblock.memory, sets
+ * MEMBLOCK_HOTPLUG on every isolated region and finally re-merges the
+ * regions of that type.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	struct memblock_type *mem = &memblock.memory;
+	int first, past_last;
+	int idx;
+	int err;
+
+	err = memblock_isolate_range(mem, base, size, &first, &past_last);
+	if (err)
+		return err;
+
+	idx = first;
+	while (idx < past_last) {
+		memblock_set_region_flags(&mem->regions[idx], MEMBLOCK_HOTPLUG);
+		idx++;
+	}
+
+	memblock_merge_regions(mem);
+	return 0;
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Isolates [@base, @base + @size) inside memblock.memory, clears
+ * MEMBLOCK_HOTPLUG on every isolated region and finally re-merges the
+ * regions of that type.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	struct memblock_type *mem = &memblock.memory;
+	int first, past_last;
+	int idx;
+	int err;
+
+	err = memblock_isolate_range(mem, base, size, &first, &past_last);
+	if (err)
+		return err;
+
+	idx = first;
+	while (idx < past_last) {
+		memblock_clear_region_flags(&mem->regions[idx],
+					    MEMBLOCK_HOTPLUG);
+		idx++;
+	}
+
+	memblock_merge_regions(mem);
+	return 0;
+}
+
+/**
+ * __next_mem_range - next function for for_each_free_mem_range() etc.
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken,
+ * can be %NULL, in which case the regions of @type_a are returned as they are
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Find the first area from *@idx which matches @nid, fill the out
+ * parameters, and update *@idx for the next iteration. The lower 32bit of
+ * *@idx contains index into type_a and the upper 32bit indexes the
+ * areas before each region in type_b. For example, if type_b regions
+ * look like the following,
+ *
+ * 0:[0-16), 1:[32-48), 2:[128-130)
+ *
+ * The upper 32bit indexes the following regions.
+ *
+ * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
+ *
+ * As both region arrays are sorted, the function advances the two indices
+ * in lockstep and returns each intersection.
+ *
+ * When nothing is left, *@idx is set to ULLONG_MAX.
+ */
+void __init_memblock __next_mem_range(u64 *idx, int nid,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
+{
+ /* unpack the two 32bit cursors carried in *@idx */
+ int idx_a = *idx & 0xffffffff;
+ int idx_b = *idx >> 32;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES,
+ "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ for (; idx_a < type_a->cnt; idx_a++) {
+ struct memblock_region *m = &type_a->regions[idx_a];
+
+ phys_addr_t m_start = m->base;
+ phys_addr_t m_end = m->base + m->size;
+ int m_nid = memblock_get_region_node(m);
+
+ /* only memory regions are associated with nodes, check it */
+ if (nid != NUMA_NO_NODE && nid != m_nid)
+ continue;
+
+ /* no exclusion list: hand out the type_a region unchanged */
+ if (!type_b) {
+ if (out_start)
+ *out_start = m_start;
+ if (out_end)
+ *out_end = m_end;
+ if (out_nid)
+ *out_nid = m_nid;
+ idx_a++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+
+ /* scan areas before each reservation */
+ for (; idx_b < type_b->cnt + 1; idx_b++) {
+ struct memblock_region *r;
+ phys_addr_t r_start;
+ phys_addr_t r_end;
+
+ /*
+ * For idx_b == cnt, r points one past the last region;
+ * it is then only read through r[-1], r->base is
+ * guarded by the idx_b < cnt test below.
+ */
+ r = &type_b->regions[idx_b];
+ r_start = idx_b ? r[-1].base + r[-1].size : 0;
+ r_end = idx_b < type_b->cnt ?
+ r->base : ULLONG_MAX;
+
+ /*
+ * if idx_b advanced past idx_a,
+ * break out to advance idx_a
+ */
+ if (r_start >= m_end)
+ break;
+ /* if the two regions intersect, we're done */
+ if (m_start < r_end) {
+ if (out_start)
+ *out_start =
+ max(m_start, r_start);
+ if (out_end)
+ *out_end = min(m_end, r_end);
+ if (out_nid)
+ *out_nid = m_nid;
+ /*
+ * The region which ends first is
+ * advanced for the next iteration.
+ */
+ if (m_end <= r_end)
+ idx_a++;
+ else
+ idx_b++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+ }
+ }
+
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
+}
+
+/**
+ * __next_mem_range_rev - generic next function for for_each_*_range_rev()
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken,
+ *	can be %NULL, in which case the regions of @type_a are returned as
+ *	they are
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
+ * Reverse of __next_mem_range(): iteration starts from the top when *@idx
+ * is ULLONG_MAX and *@idx is set back to ULLONG_MAX once nothing is left.
+ * The lower 32bit of *@idx hold the index into type_a, the upper 32bit the
+ * index of the area in front of a type_b region.
+ */
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+					   struct memblock_type *type_a,
+					   struct memblock_type *type_b,
+					   phys_addr_t *out_start,
+					   phys_addr_t *out_end, int *out_nid)
+{
+	int idx_a = *idx & 0xffffffff;
+	int idx_b = *idx >> 32;
+
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
+	if (*idx == (u64)ULLONG_MAX) {
+		idx_a = type_a->cnt - 1;
+		/* @type_b is optional and must not be dereferenced when NULL */
+		idx_b = type_b ? type_b->cnt : 0;
+	}
+
+	for (; idx_a >= 0; idx_a--) {
+		struct memblock_region *m = &type_a->regions[idx_a];
+
+		phys_addr_t m_start = m->base;
+		phys_addr_t m_end = m->base + m->size;
+		int m_nid = memblock_get_region_node(m);
+
+		/* only memory regions are associated with nodes, check it */
+		if (nid != NUMA_NO_NODE && nid != m_nid)
+			continue;
+
+		/* skip hotpluggable memory regions if needed */
+		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+			continue;
+
+		/* no exclusion list: hand out the type_a region unchanged */
+		if (!type_b) {
+			if (out_start)
+				*out_start = m_start;
+			if (out_end)
+				*out_end = m_end;
+			if (out_nid)
+				*out_nid = m_nid;
+			/*
+			 * We walk downwards, so the next candidate is the
+			 * previous region; incrementing here would index past
+			 * the end of regions[] on the following call.
+			 */
+			idx_a--;
+			*idx = (u32)idx_a | (u64)idx_b << 32;
+			return;
+		}
+
+		/* scan areas before each reservation */
+		for (; idx_b >= 0; idx_b--) {
+			struct memblock_region *r;
+			phys_addr_t r_start;
+			phys_addr_t r_end;
+
+			r = &type_b->regions[idx_b];
+			r_start = idx_b ? r[-1].base + r[-1].size : 0;
+			r_end = idx_b < type_b->cnt ?
+				r->base : ULLONG_MAX;
+
+			/*
+			 * if idx_b advanced past idx_a,
+			 * break out to advance idx_a
+			 */
+			if (r_end <= m_start)
+				break;
+			/* if the two regions intersect, we're done */
+			if (m_end > r_start) {
+				if (out_start)
+					*out_start = max(m_start, r_start);
+				if (out_end)
+					*out_end = min(m_end, r_end);
+				if (out_nid)
+					*out_nid = m_nid;
+				/*
+				 * The region which starts last is
+				 * stepped back for the next iteration.
+				 */
+				if (m_start >= r_start)
+					idx_a--;
+				else
+					idx_b--;
+				*idx = (u32)idx_a | (u64)idx_b << 32;
+				return;
+			}
+		}
+	}
+	/* signal end of iteration */
+	*idx = ULLONG_MAX;
+}
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/*
+ * Common iterator interface used to define for_each_mem_range().
+ *
+ * Advances *@idx to the next memblock.memory region that still contains at
+ * least one whole page and whose nid matches @nid (MAX_NUMNODES matches
+ * every region), then reports its page frame range through the optional
+ * out pointers. *@idx is set to -1 when no such region is left.
+ */
+void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+				unsigned long *out_start_pfn,
+				unsigned long *out_end_pfn, int *out_nid)
+{
+	struct memblock_type *mem = &memblock.memory;
+	struct memblock_region *rgn;
+
+	for (;;) {
+		if (++*idx >= mem->cnt) {
+			*idx = -1;
+			return;
+		}
+
+		rgn = &mem->regions[*idx];
+
+		/* region too small to hold a complete page */
+		if (PFN_UP(rgn->base) >= PFN_DOWN(rgn->base + rgn->size))
+			continue;
+		if (nid == MAX_NUMNODES || nid == rgn->nid)
+			break;
+	}
+
+	if (out_start_pfn)
+		*out_start_pfn = PFN_UP(rgn->base);
+	if (out_end_pfn)
+		*out_end_pfn = PFN_DOWN(rgn->base + rgn->size);
+	if (out_nid)
+		*out_nid = rgn->nid;
+}
+
+/**
+ * memblock_set_node - set node ID on memblock regions
+ * @base: base of area to set node ID for
+ * @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
+ * @nid: node ID to set
+ *
+ * Isolates [@base,@base+@size) within @type (splitting regions which cross
+ * the area boundaries), assigns @nid to every isolated region and merges
+ * the regions of @type again.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
+				      struct memblock_type *type, int nid)
+{
+	int first, past_last;
+	int idx;
+	int err;
+
+	err = memblock_isolate_range(type, base, size, &first, &past_last);
+	if (err)
+		return err;
+
+	idx = first;
+	while (idx < past_last) {
+		memblock_set_region_node(&type->regions[idx], nid);
+		idx++;
+	}
+
+	memblock_merge_regions(type);
+	return 0;
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+/*
+ * Find @size bytes aligned to @align (SMP_CACHE_BYTES when @align is 0)
+ * between @start and @end on node @nid, reserve them and register the
+ * block with kmemleak.
+ *
+ * Returns the physical address, or 0 when no range was found or the
+ * reservation failed.
+ */
+static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+					phys_addr_t align, phys_addr_t start,
+					phys_addr_t end, int nid)
+{
+	phys_addr_t found;
+
+	if (!align)
+		align = SMP_CACHE_BYTES;
+
+	found = memblock_find_in_range_node(size, align, start, end, nid);
+	if (!found)
+		return 0;
+	if (memblock_reserve(found, size))
+		return 0;
+
+	/*
+	 * The min_count is set to 0 so that memblock allocations are
+	 * never reported as leaks.
+	 */
+	kmemleak_alloc(__va(found), size, 0, 0);
+
+	return found;
+}
+
+/* Allocate from [@start, @end) without a node preference (NUMA_NO_NODE). */
+phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
+					phys_addr_t start, phys_addr_t end)
+{
+	const int nid = NUMA_NO_NODE;
+
+	return memblock_alloc_range_nid(size, align, start, end, nid);
+}
+
+/* Allocate on node @nid from the range that starts at 0 and ends at @max_addr. */
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+					phys_addr_t align, phys_addr_t max_addr,
+					int nid)
+{
+	const phys_addr_t min_addr = 0;
+
+	return memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+}
+
+/* Allocate on node @nid with MEMBLOCK_ALLOC_ACCESSIBLE as the upper bound. */
+phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;
+
+	return memblock_alloc_base_nid(size, align, max_addr, nid);
+}
+
+/*
+ * Allocate below @max_addr without a node preference (NUMA_NO_NODE).
+ * Hands back the result of memblock_alloc_base_nid() unchanged, so a
+ * failure is reported as 0 rather than by panicking.
+ */
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+	const int nid = NUMA_NO_NODE;
+
+	return memblock_alloc_base_nid(size, align, max_addr, nid);
+}
+
+/*
+ * Panicking variant of __memblock_alloc_base(): a zero result from the
+ * allocator ends in panic() instead of being returned to the caller.
+ */
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+	phys_addr_t alloc = __memblock_alloc_base(size, align, max_addr);
+
+	if (!alloc)
+		panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+		      (unsigned long long) size, (unsigned long long) max_addr);
+
+	return alloc;
+}
+
+/* Allocate via memblock_alloc_base() with MEMBLOCK_ALLOC_ACCESSIBLE as limit. */
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+	phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;
+
+	return memblock_alloc_base(size, align, max_addr);
+}
+
+/*
+ * Try node @nid first; when that yields nothing fall back to
+ * memblock_alloc_base() with MEMBLOCK_ALLOC_ACCESSIBLE as limit.
+ */
+phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+	phys_addr_t res;
+
+	res = memblock_alloc_nid(size, align, nid);
+	if (!res)
+		res = memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+
+	return res;
+}
+
+/**
+ * memblock_virt_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The search is attempted in this order: on @nid within
+ * [@min_addr, @max_addr], then on any node within the same bounds, and
+ * finally - when @min_addr is not zero - once more with the lower bound
+ * dropped to 0.
+ *
+ * @max_addr is clamped to memblock.current_limit and @align defaults to
+ * SMP_CACHE_BYTES when it is 0.
+ *
+ * The block that was found is reserved, converted to a virtual address,
+ * zeroed and registered with kmemleak using a min_count of 0.
+ *
+ * When the slab allocator is already available, a one-time warning is
+ * raised and the request is served by kzalloc_node() instead.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_virt_alloc_internal(
+				phys_addr_t size, phys_addr_t align,
+				phys_addr_t min_addr, phys_addr_t max_addr,
+				int nid)
+{
+	phys_addr_t alloc;
+	void *ptr;
+
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
+	/*
+	 * Detect any accidental use of these APIs after slab is ready, as at
+	 * this moment memblock may be deinitialized already and its
+	 * internal data may be destroyed (after execution of free_all_bootmem)
+	 */
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, nid);
+
+	if (!align)
+		align = SMP_CACHE_BYTES;
+
+	if (max_addr > memblock.current_limit)
+		max_addr = memblock.current_limit;
+
+	for (;;) {
+		alloc = memblock_find_in_range_node(size, align, min_addr,
+						    max_addr, nid);
+
+		/* requested node had nothing: retry on any node */
+		if (!alloc && nid != NUMA_NO_NODE)
+			alloc = memblock_find_in_range_node(size, align,
+							    min_addr, max_addr,
+							    NUMA_NO_NODE);
+		if (alloc)
+			break;
+
+		/* the lower bound is already gone, nothing else to try */
+		if (!min_addr)
+			return NULL;
+
+		min_addr = 0;
+	}
+
+	memblock_reserve(alloc, size);
+	ptr = phys_to_virt(alloc);
+	memset(ptr, 0, size);
+
+	/*
+	 * The min_count is set to 0 so that bootmem allocated blocks
+	 * are never reported as leaks. This is because many of these blocks
+	 * are only referred via the physical address which is not
+	 * looked up by kmemleak.
+	 */
+	kmemleak_alloc(ptr, size, 0, 0);
+
+	return ptr;
+}
+
+/**
+ * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *	  is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *	      is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *	      allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Logs the request, including the caller's return address, through
+ * memblock_dbg() and forwards it to memblock_virt_alloc_internal().
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_nopanic(
+				phys_addr_t size, phys_addr_t align,
+				phys_addr_t min_addr, phys_addr_t max_addr,
+				int nid)
+{
+	void *ptr;
+
+	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+		     __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+		     (u64)max_addr, (void *)_RET_IP_);
+
+	ptr = memblock_virt_alloc_internal(size, align, min_addr,
+					   max_addr, nid);
+	return ptr;
+}
+
+/**
+ * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ *	  is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ *	      is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ *	      allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Logs the request, including the caller's return address, through
+ * memblock_dbg(), forwards it to memblock_virt_alloc_internal() and calls
+ * panic() when that returns NULL.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success; a failed request
+ * ends in panic().
+ */
+void * __init memblock_virt_alloc_try_nid(
+			phys_addr_t size, phys_addr_t align,
+			phys_addr_t min_addr, phys_addr_t max_addr,
+			int nid)
+{
+	void *ptr;
+
+	memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+		     __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+		     (u64)max_addr, (void *)_RET_IP_);
+
+	ptr = memblock_virt_alloc_internal(size, align,
+					   min_addr, max_addr, nid);
+	if (!ptr)
+		panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
+		      __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+		      (u64)max_addr);
+
+	return ptr;
+}
+
+/**
+ * __memblock_free_early - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
+ * The range is reported to kmemleak and removed from memblock.reserved;
+ * the freed memory is not released to the buddy allocator.
+ */
+void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
+{
+	u64 first = (u64)base;
+	u64 last = first + size - 1;
+
+	memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+		     __func__, first, last, (void *)_RET_IP_);
+
+	kmemleak_free_part(__va(base), size);
+	memblock_remove_range(&memblock.reserved, base, size);
+}
+
+/*
+ * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Every page that lies
+ * completely inside [@base, @base + @size) is released directly to the
+ * buddy allocator and counted in totalram_pages; no bootmem metadata is
+ * updated because it is gone.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+	u64 pfn, end_pfn;
+
+	memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+		     __func__, (u64)base, (u64)base + size - 1,
+		     (void *)_RET_IP_);
+	kmemleak_free_part(__va(base), size);
+
+	pfn = PFN_UP(base);
+	end_pfn = PFN_DOWN(base + size);
+
+	while (pfn < end_pfn) {
+		__free_pages_bootmem(pfn_to_page(pfn), 0);
+		totalram_pages++;
+		pfn++;
+	}
+}
+
+/*
+ * Remaining API functions
+ */
+
+/* Return the total_size recorded for the memblock.memory type. */
+phys_addr_t __init memblock_phys_mem_size(void)
+{
+ return memblock.memory.total_size;
+}
+
+/*
+ * Sum up the pages of all memory regions that lie below @limit_pfn and
+ * return the result converted with PFN_PHYS(). Both ends of every region
+ * are clamped to @limit_pfn before being counted.
+ */
+phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
+{
+	struct memblock_region *rgn;
+	unsigned long total = 0;
+
+	for_each_memblock(memory, rgn) {
+		unsigned long first = memblock_region_memory_base_pfn(rgn);
+		unsigned long last = memblock_region_memory_end_pfn(rgn);
+
+		if (first > limit_pfn)
+			first = limit_pfn;
+		if (last > limit_pfn)
+			last = limit_pfn;
+
+		total += last - first;
+	}
+
+	return PFN_PHYS(total);
+}
+
+/* lowest address: the base of the first region of memblock.memory */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+ return memblock.memory.regions[0].base;
+}
+
+/* End address (base + size) of the last region of memblock.memory. */
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
+{
+	int last = memblock.memory.cnt - 1;
+	struct memblock_region *rgn = &memblock.memory.regions[last];
+
+	return (rgn->base + rgn->size);
+}
+
+/*
+ * Cut memblock down to the first @limit bytes of memory.
+ *
+ * The memory regions are walked in order while their sizes are subtracted
+ * from @limit; the address at which the budget runs out becomes the cut-off
+ * and everything from there upwards is removed from both the memory and
+ * the reserved type. A @limit of 0 leaves memblock untouched.
+ */
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
+{
+	struct memblock_region *rgn;
+	phys_addr_t cutoff = (phys_addr_t)ULLONG_MAX;
+	const phys_addr_t rest = (phys_addr_t)ULLONG_MAX;
+
+	if (!limit)
+		return;
+
+	/* find out max address */
+	for_each_memblock(memory, rgn) {
+		if (limit > rgn->size) {
+			limit -= rgn->size;
+			continue;
+		}
+		cutoff = rgn->base + limit;
+		break;
+	}
+
+	/* truncate both memory and reserved regions */
+	memblock_remove_range(&memblock.memory, cutoff, rest);
+	memblock_remove_range(&memblock.reserved, cutoff, rest);
+}
+
+/*
+ * Binary search @type for the region that contains @addr.
+ *
+ * Returns the index of that region, or -1 when no region covers @addr.
+ * The first probe is made before the bounds are compared.
+ */
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+	unsigned int lo = 0, hi = type->cnt;
+
+	do {
+		unsigned int probe = (hi + lo) / 2;
+		struct memblock_region *rgn = &type->regions[probe];
+
+		if (addr < rgn->base) {
+			hi = probe;
+			continue;
+		}
+		if (addr >= (rgn->base + rgn->size)) {
+			lo = probe + 1;
+			continue;
+		}
+		return probe;
+	} while (lo < hi);
+
+	return -1;
+}
+
+/* Non-zero when @addr lies inside a region of memblock.reserved. */
+int __init memblock_is_reserved(phys_addr_t addr)
+{
+	int idx = memblock_search(&memblock.reserved, addr);
+
+	return idx != -1;
+}
+
+/* Non-zero when @addr lies inside a region of memblock.memory. */
+int __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+	int idx = memblock_search(&memblock.memory, addr);
+
+	return idx != -1;
+}
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/*
+ * Look up the memory region that contains page frame @pfn.
+ *
+ * On a hit the region's page frame bounds are stored in *@start_pfn and
+ * *@end_pfn and its nid is returned; -1 is returned otherwise and the out
+ * parameters are left untouched.
+ */
+int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	struct memblock_type *mem = &memblock.memory;
+	struct memblock_region *rgn;
+	int hit;
+
+	hit = memblock_search(mem, PFN_PHYS(pfn));
+	if (hit == -1)
+		return -1;
+
+	rgn = &mem->regions[hit];
+	*start_pfn = PFN_DOWN(rgn->base);
+	*end_pfn = PFN_DOWN(rgn->base + rgn->size);
+
+	return rgn->nid;
+}
+#endif
+
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Looks up the memory region containing @base and checks that this single
+ * region also covers the end of [@base, @base+@size).
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
+int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+	int idx = memblock_search(&memblock.memory, base);
+	phys_addr_t end = base + memblock_cap_size(base, &size);
+	struct memblock_region *rgn;
+
+	if (idx == -1)
+		return 0;
+
+	rgn = &memblock.memory.regions[idx];
+
+	return rgn->base <= base && (rgn->base + rgn->size) >= end;
+}
+
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ * @size is passed through memblock_cap_size() before the overlap lookup.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
+int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+	long hit;
+
+	memblock_cap_size(base, &size);
+	hit = memblock_overlaps_region(&memblock.reserved, base, size);
+
+	return hit >= 0;
+}
+
+/*
+ * Shrink every memory region so that its base and end are multiples of
+ * @align. A region that has no aligned space left after rounding is
+ * removed altogether.
+ */
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+ phys_addr_t start, end, orig_start, orig_end;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ orig_start = r->base;
+ orig_end = r->base + r->size;
+ /* move the base up and the end down to the next @align boundary */
+ start = round_up(orig_start, align);
+ end = round_down(orig_end, align);
+
+ /* already aligned on both sides */
+ if (start == orig_start && end == orig_end)
+ continue;
+
+ if (start < end) {
+ r->base = start;
+ r->size = end - start;
+ } else {
+ memblock_remove_region(&memblock.memory,
+ r - memblock.memory.regions);
+ /*
+ * NOTE(review): stepping back presumably makes the
+ * iterator revisit this slot, which should now hold the
+ * following region - confirm against
+ * memblock_remove_region() and for_each_memblock().
+ */
+ r--;
+ }
+ }
+}
+
+/* Store @limit in memblock.current_limit. */
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
+{
+ memblock.current_limit = limit;
+}
+
+/* Return the value currently held in memblock.current_limit. */
+phys_addr_t __init_memblock memblock_get_current_limit(void)
+{
+ return memblock.current_limit;
+}
+
+/*
+ * Print the region count of @type followed by one line per region (address
+ * range, size, flags and - with CONFIG_HAVE_MEMBLOCK_NODE_MAP - the node,
+ * unless the region's nid is MAX_NUMNODES). @name labels the output.
+ */
+static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
+{
+	int i;
+
+	pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
+
+	for (i = 0; i < type->cnt; i++) {
+		struct memblock_region *rgn = &type->regions[i];
+		unsigned long long base = rgn->base;
+		unsigned long long size = rgn->size;
+		unsigned long flags = rgn->flags;
+		char nid_buf[32] = "";
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+		if (memblock_get_region_node(rgn) != MAX_NUMNODES)
+			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
+				 memblock_get_region_node(rgn));
+#endif
+		pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
+			name, i, base, base + size - 1, size, nid_buf, flags);
+	}
+}
+
+/*
+ * Print the total sizes of the memory and reserved types, then dump the
+ * regions of both.
+ */
+void __init_memblock __memblock_dump_all(void)
+{
+	unsigned long long mem_total =
+		(unsigned long long)memblock.memory.total_size;
+	unsigned long long rsv_total =
+		(unsigned long long)memblock.reserved.total_size;
+
+	pr_info("MEMBLOCK configuration:\n");
+	pr_info(" memory size = %#llx reserved size = %#llx\n",
+		mem_total, rsv_total);
+
+	memblock_dump(&memblock.memory, "memory");
+	memblock_dump(&memblock.reserved, "reserved");
+}
+
+/* Set the memblock_can_resize flag. */
+void __init memblock_allow_resize(void)
+{
+ memblock_can_resize = 1;
+}
+
+/*
+ * Handler for the "memblock" early parameter: turns memblock_debug on when
+ * the argument contains the substring "debug". Always returns 0.
+ */
+static int __init early_memblock(char *p)
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "debug"))
+		memblock_debug = 1;
+
+	return 0;
+}
+early_param("memblock", early_memblock);
+
+#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+
+/*
+ * seq_file show callback: print one "index: first..last" line for every
+ * region of the memblock_type stored in @m->private. The address width of
+ * the output follows sizeof(phys_addr_t).
+ */
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+	struct memblock_type *type = m->private;
+	int i;
+
+	for (i = 0; i < type->cnt; i++) {
+		struct memblock_region *reg = &type->regions[i];
+		phys_addr_t first = reg->base;
+		phys_addr_t last = reg->base + reg->size - 1;
+
+		seq_printf(m, "%4d: ", i);
+		if (sizeof(phys_addr_t) == 4)
+			seq_printf(m, "0x%08lx..0x%08lx\n",
+				   (unsigned long)first,
+				   (unsigned long)last);
+		else
+			seq_printf(m, "0x%016llx..0x%016llx\n",
+				   (unsigned long long)first,
+				   (unsigned long long)last);
+	}
+
+	return 0;
+}
+
+/* open callback: bind memblock_debug_show() to the inode's private data. */
+static int memblock_debug_open(struct inode *inode, struct file *file)
+{
+	void *type = inode->i_private;
+
+	return single_open(file, memblock_debug_show, type);
+}
+
+/*
+ * File operations of the memblock debugfs files: memblock_debug_open() on
+ * open, the seq_file helpers for read/llseek and single_release on release.
+ */
+static const struct file_operations memblock_debug_fops = {
+ .open = memblock_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create the "memblock" debugfs directory with one read-only file per
+ * memblock type ("memory", "reserved" and, with
+ * CONFIG_HAVE_MEMBLOCK_PHYS_MAP, "physmem").
+ *
+ * Returns -ENXIO when the directory cannot be created, 0 otherwise.
+ */
+static int __init memblock_init_debugfs(void)
+{
+	struct dentry *root;
+
+	root = debugfs_create_dir("memblock", NULL);
+	if (!root)
+		return -ENXIO;
+
+	debugfs_create_file("memory", S_IRUGO, root,
+			    &memblock.memory, &memblock_debug_fops);
+	debugfs_create_file("reserved", S_IRUGO, root,
+			    &memblock.reserved, &memblock_debug_fops);
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+	debugfs_create_file("physmem", S_IRUGO, root,
+			    &memblock.physmem, &memblock_debug_fops);
+#endif
+
+	return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 00000000000..1f14a430c65
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,7080 @@
+/* memcontrol.c - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * Memory thresholds
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp.h>
+#include <linux/page-flags.h>
+#include <linux/backing-dev.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/limits.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/spinlock.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/vmpressure.h>
+#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
+#include <linux/cpu.h>
+#include <linux/oom.h>
+#include <linux/lockdep.h>
+#include <linux/file.h>
+#include "internal.h"
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp_memcontrol.h>
+#include "slab.h"
+
+#include <asm/uaccess.h>
+
+#include <trace/events/vmscan.h>
+
+/* Tentative definition of the memory cgroup subsystem object; exported. */
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
+
+/* NOTE(review): presumably the retry budget of the reclaim loop - confirm at the users */
+#define MEM_CGROUP_RECLAIM_RETRIES 5
+/* NOTE(review): presumably the memcg at the top of the hierarchy - confirm */
+static struct mem_cgroup *root_mem_cgroup __read_mostly;
+
+#ifdef CONFIG_MEMCG_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
+int do_swap_account __read_mostly;
+
+/* Remembers the boot option; defaults to 1 with CONFIG_MEMCG_SWAP_ENABLED */
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+#else
+/* without CONFIG_MEMCG_SWAP the switch is the constant 0 */
+#define do_swap_account 0
+#endif
+
+
+/*
+ * Names of the statistics counters.
+ * NOTE(review): presumably indexed by the stat index enum that also
+ * provides MEM_CGROUP_STAT_NSTATS - confirm the order against it.
+ */
+static const char * const mem_cgroup_stat_names[] = {
+ "cache",
+ "rss",
+ "rss_huge",
+ "mapped_file",
+ "writeback",
+ "swap",
+};
+
+/* Indices of the event counters; NSTATS is the number of counters. */
+enum mem_cgroup_events_index {
+ MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
+ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
+ MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
+ MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
+ MEM_CGROUP_EVENTS_NSTATS,
+};
+
+/* Event names, listed in the order of enum mem_cgroup_events_index. */
+static const char * const mem_cgroup_events_names[] = {
+ "pgpgin",
+ "pgpgout",
+ "pgfault",
+ "pgmajfault",
+};
+
+/*
+ * Names of the LRU lists.
+ * NOTE(review): presumably indexed by the LRU list enum that provides
+ * NR_LRU_LISTS - confirm the order against it.
+ */
+static const char * const mem_cgroup_lru_names[] = {
+ "inactive_anon",
+ "active_anon",
+ "inactive_file",
+ "active_file",
+ "unevictable",
+};
+
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremented by the number of pages. This counter is used to
+ * trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
+ MEM_CGROUP_NTARGETS,
+};
+/* NOTE(review): presumably the number of page events between two runs of each target - confirm */
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+#define NUMAINFO_EVENTS_TARGET 1024
+
+/*
+ * Counter block of a memcg.
+ * NOTE(review): the name suggests one instance per CPU - confirm at the
+ * allocation site.
+ */
+struct mem_cgroup_stat_cpu {
+ /* one slot per statistics counter */
+ long count[MEM_CGROUP_STAT_NSTATS];
+ /* one slot per enum mem_cgroup_events_index value */
+ unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long nr_page_events;
+ /* one slot per enum mem_cgroup_events_target value */
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+/* Iteration state of a reclaim walk over a memcg hierarchy. */
+struct mem_cgroup_reclaim_iter {
+ /*
+ * last scanned hierarchy member. Valid only if last_dead_count
+ * matches memcg->dead_count of the hierarchy root group.
+ */
+ struct mem_cgroup *last_visited;
+ int last_dead_count;
+
+ /* scan generation, increased every round-trip */
+ unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+ struct lruvec lruvec;
+ /* one size per LRU list */
+ unsigned long lru_size[NR_LRU_LISTS];
+
+ /* one iterator per priority level, 0..DEF_PRIORITY */
+ struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
+ /* use container_of */
+};
+
+/* Per-node information: one mem_cgroup_per_zone for each of MAX_NR_ZONES zones. */
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+/* One RB-tree root together with a spinlock. */
+struct mem_cgroup_tree_per_zone {
+ struct rb_root rb_root;
+ spinlock_t lock;
+};
+
+/* The per-zone trees of one node, MAX_NR_ZONES entries. */
+struct mem_cgroup_tree_per_node {
+ struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+/* Top level of the tree structure: one per-node pointer for each of MAX_NUMNODES nodes. */
+struct mem_cgroup_tree {
+ struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+/* The single instance of the tree structure used by this file. */
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/* One threshold entry: an eventfd context paired with a threshold value. */
+struct mem_cgroup_threshold {
+ struct eventfd_ctx *eventfd;
+ u64 threshold;
+};
+
+/* For threshold: a sized array of threshold entries plus a cursor into it */
+struct mem_cgroup_threshold_ary {
+ /* An array index points to threshold just below or equal to usage. */
+ int current_threshold;
+ /* Size of entries[] */
+ unsigned int size;
+ /* Array of thresholds */
+ struct mem_cgroup_threshold entries[0];
+};
+
+/* A primary threshold array together with its spare. */
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/* for OOM: list element that carries one eventfd context */
+struct mem_cgroup_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
+/*
+ * mem_cgroup_event represents events which userspace wants to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these is stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want to provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below are needed to unregister the event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
+
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ *
+ * TODO: Add a water mark for the memory controller. Reclaim will begin when
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for memory usage
+ */
+ struct res_counter res;
+
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
+ /*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
+
+ /*
+ * the counter to account for kernel memory usage.
+ */
+ struct res_counter kmem;
+ /*
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+ unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
+
+ bool oom_lock;
+ atomic_t under_oom;
+ atomic_t oom_wakeups;
+
+ int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
+
+ /* set when res.limit == memsw.limit */
+ bool memsw_is_minimum;
+
+ /* protect arrays of thresholds */
+ struct mutex thresholds_lock;
+
+ /* thresholds for memory usage. RCU-protected */
+ struct mem_cgroup_thresholds thresholds;
+
+ /* thresholds for mem+swap usage. RCU-protected */
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+
+ /*
+ * Should we move charges of a task when a task is moved into this
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+ /*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
+ /*
+ * percpu counter.
+ */
+ struct mem_cgroup_stat_cpu __percpu *stat;
+ /*
+ * used when a cpu is offlined or other synchronizations
+ * See mem_cgroup_read_stat().
+ */
+ struct mem_cgroup_stat_cpu nocpu_base;
+ spinlock_t pcp_counter_lock;
+
+ atomic_t dead_count;
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+ struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* analogous to slab_common's slab_caches list, but per-memcg;
+ * protected by memcg_slab_mutex */
+ struct list_head memcg_slab_caches;
+ /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ int kmemcg_id;
+#endif
+
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
+};
+
+ /*
+ * Internal-only representation of the status of kmem accounting.
+ * The enumerators are bit numbers in mem_cgroup->kmem_account_flags.
+ */
+ enum {
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
+ };
+
+#ifdef CONFIG_MEMCG_KMEM
+ /* Set the KMEM_ACCOUNTED_ACTIVE bit in @memcg's kmem accounting flags. */
+ static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
+ {
+ 	unsigned long *flags = &memcg->kmem_account_flags;
+
+ 	set_bit(KMEM_ACCOUNTED_ACTIVE, flags);
+ }
+
+ /* Report whether the KMEM_ACCOUNTED_ACTIVE bit is set for @memcg. */
+ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+ {
+ 	unsigned long *flags = &memcg->kmem_account_flags;
+
+ 	return test_bit(KMEM_ACCOUNTED_ACTIVE, flags);
+ }
+
+ static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
+ {
+ /*
+ * Our caller must use css_get() first, because memcg_uncharge_kmem()
+ * will call css_put() if it sees the memcg is dead.
+ *
+ * The DEAD bit is only set when the ACTIVE bit is set; a memcg
+ * whose ACTIVE bit is clear is left untouched.
+ */
+ smp_wmb(); /* NOTE(review): the matching read side is outside this chunk -- confirm pairing */
+ if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
+ set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
+ }
+
+ /*
+  * Atomically clear the KMEM_ACCOUNTED_DEAD bit of @memcg and return
+  * whether it was set beforehand.
+  */
+ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
+ {
+ 	unsigned long *flags = &memcg->kmem_account_flags;
+
+ 	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, flags);
+ }
+#endif
+
+ /* Definitions for moving charges at task migration. */
+ /*
+ * Types of charges to be moved. "move_charge_at_immigrate" and
+ * "immigrate_flags" are treated as a left-shifted bitmap of these types,
+ * i.e. each enumerator is a bit number (see move_anon()/move_file()).
+ */
+ enum move_type {
+ MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
+ MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
+ NR_MOVE_TYPE, /* number of move types; keep last */
+ };
+
+ /* "mc" and its members are protected by cgroup_mutex */
+ static struct move_charge_struct {
+ spinlock_t lock; /* for from, to */
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ unsigned long immigrate_flags; /* bitmap of enum move_type, tested by move_anon()/move_file() */
+ unsigned long precharge;
+ unsigned long moved_charge;
+ unsigned long moved_swap;
+ struct task_struct *moving_task; /* a task moving charges */
+ wait_queue_head_t waitq; /* a waitq for other context */
+ } mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock), /* statically initialised, no runtime init needed */
+ .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
+ };
+
+ /* True when the current move-charge context asks for anon charges to move. */
+ static bool move_anon(void)
+ {
+ 	const unsigned long *flags = &mc.immigrate_flags;
+
+ 	return test_bit(MOVE_CHARGE_TYPE_ANON, flags);
+ }
+
+ /* True when the current move-charge context asks for file charges to move. */
+ static bool move_file(void)
+ {
+ 	const unsigned long *flags = &mc.immigrate_flags;
+
+ 	return test_bit(MOVE_CHARGE_TYPE_FILE, flags);
+ }
+
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
+
+ enum charge_type { /* kind of charge being accounted */
+ MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
+ MEM_CGROUP_CHARGE_TYPE_ANON,
+ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
+ MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
+ NR_CHARGE_TYPE, /* number of charge types; keep last */
+ };
+
+ /*
+ * For encoding the cft->private value of a control file: this is the "type"
+ * half packed by MEMFILE_PRIVATE() and extracted by MEMFILE_TYPE().
+ */
+ enum res_type {
+ _MEM,
+ _MEMSWAP,
+ _OOM_TYPE,
+ _KMEM,
+ };
+
+ #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) /* pack type x above bit 16, attribute val below */
+ #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) /* extract the 16-bit type */
+ #define MEMFILE_ATTR(val) ((val) & 0xffff) /* extract the low 16-bit attribute */
+ /* Used for OOM notifier */
+ #define OOM_CONTROL (0)
+
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
+#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+
+/*
+ * The memcg_create_mutex will be held whenever a new cgroup is created.
+ * As a consequence, any change that needs to protect against new child cgroups
+ * appearing has to hold it as well.
+ */
+static DEFINE_MUTEX(memcg_create_mutex);
+
+ /*
+  * Convert a css pointer into the mem_cgroup that embeds it.
+  * A NULL @s yields NULL.
+  */
+ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
+ {
+ 	if (!s)
+ 		return NULL;
+
+ 	return container_of(s, struct mem_cgroup, css);
+ }
+
+ /*
+  * Accessor for the vmpressure state embedded in a memcg.
+  * A NULL @memcg selects the root memory cgroup.
+  */
+ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;
+
+ 	return &target->vmpressure;
+ }
+
+ /*
+  * Inverse of memcg_to_vmpressure(): map an embedded vmpressure back to
+  * the css of the mem_cgroup that contains it.
+  */
+ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+ {
+ 	struct mem_cgroup *owner;
+
+ 	owner = container_of(vmpr, struct mem_cgroup, vmpressure);
+ 	return &owner->css;
+ }
+
+ /* Report whether @memcg is the root memory cgroup. */
+ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+ {
+ 	bool is_root = (root_mem_cgroup == memcg);
+
+ 	return is_root;
+ }
+
+/*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+ /* Return the id of @memcg, taken from its css, as an unsigned short. */
+ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+ {
+ 	struct cgroup_subsys_state *css = &memcg->css;
+
+ 	return css->id;
+ }
+
+ /*
+  * Look up the memcg whose css carries @id in the memory subsystem.
+  * Yields NULL when css_from_id() finds nothing.
+  */
+ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+ {
+ 	return mem_cgroup_from_css(css_from_id(id, &memory_cgrp_subsys));
+ }
+
+/* Writing them here to avoid exposing memcg's inner layout */
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+
+ /*
+  * Associate @sk with a memcg for socket memory accounting.
+  *
+  * Does nothing unless socket accounting is enabled. A socket that already
+  * has sk_cgrp set (e.g. a clone) keeps it and gains one more css reference.
+  * Otherwise the current task's memcg is used, provided it is not the root
+  * memcg, its protocol accounting is active, and a css reference can be
+  * taken.
+  */
+ void sock_update_memcg(struct sock *sk)
+ {
+ 	struct cg_proto *cg_proto;
+ 	struct mem_cgroup *memcg;
+
+ 	if (!mem_cgroup_sockets_enabled)
+ 		return;
+
+ 	BUG_ON(!sk->sk_prot->proto_cgroup);
+
+ 	/*
+ 	 * Socket cloning can throw us here with sk_cgrp already
+ 	 * filled. It won't however, necessarily happen from
+ 	 * process context. So the test for root memcg given
+ 	 * the current task's memcg won't help us in this case.
+ 	 *
+ 	 * Respecting the original socket's memcg is a better
+ 	 * decision in this case.
+ 	 */
+ 	if (sk->sk_cgrp) {
+ 		BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
+ 		css_get(&sk->sk_cgrp->memcg->css);
+ 		return;
+ 	}
+
+ 	rcu_read_lock();
+ 	memcg = mem_cgroup_from_task(current);
+ 	cg_proto = sk->sk_prot->proto_cgroup(memcg);
+ 	if (!mem_cgroup_is_root(memcg) &&
+ 	    memcg_proto_active(cg_proto) &&
+ 	    css_tryget_online(&memcg->css))
+ 		sk->sk_cgrp = cg_proto;
+ 	rcu_read_unlock();
+ }
+ EXPORT_SYMBOL(sock_update_memcg);
+
+ /*
+  * Drop the css reference that sock_update_memcg() took for @sk.
+  *
+  * Does nothing when socket accounting is disabled or @sk has no sk_cgrp.
+  * A cg_proto without a memcg is unexpected: warn and return instead of
+  * handing a pointer derived from NULL to css_put().
+  */
+ void sock_release_memcg(struct sock *sk)
+ {
+ 	struct mem_cgroup *memcg;
+
+ 	if (!mem_cgroup_sockets_enabled || !sk->sk_cgrp)
+ 		return;
+
+ 	memcg = sk->sk_cgrp->memcg;
+ 	if (WARN_ON(!memcg))
+ 		return;
+
+ 	css_put(&memcg->css);
+ }
+
+ /*
+  * Return the TCP cg_proto embedded in @memcg, or NULL when @memcg is
+  * NULL or the root memcg.
+  */
+ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
+ {
+ 	if (!memcg)
+ 		return NULL;
+ 	if (mem_cgroup_is_root(memcg))
+ 		return NULL;
+
+ 	return &memcg->tcp_mem;
+ }
+ EXPORT_SYMBOL(tcp_proto_cgroup);
+
+ /*
+  * Drop the socket-limit static key reference held on behalf of @memcg,
+  * if its TCP accounting was ever activated.
+  */
+ static void disarm_sock_keys(struct mem_cgroup *memcg)
+ {
+ 	if (memcg_proto_activated(&memcg->tcp_mem))
+ 		static_key_slow_dec(&memcg_socket_limit_enabled);
+ }
+#else
+ static void disarm_sock_keys(struct mem_cgroup *memcg)
+ { /* Stub for builds without both CONFIG_INET and CONFIG_MEMCG_KMEM: does nothing. */
+ }
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * The main reason for not using cgroup id for this:
+ * this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+int memcg_limited_groups_array_size;
+
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
+
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that does
+ * kmem_cache_alloc and the such to see this symbol as well
+ */
+struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+ /*
+  * Undo the kmem accounting bookkeeping for @memcg: if kmem accounting was
+  * active, drop the static key reference and release the kmemcg_id. Then
+  * warn if any kmem usage is still charged.
+  */
+ static void disarm_kmem_keys(struct mem_cgroup *memcg)
+ {
+ 	u64 kmem_usage;
+
+ 	if (memcg_kmem_is_active(memcg)) {
+ 		static_key_slow_dec(&memcg_kmem_enabled_key);
+ 		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+ 	}
+
+ 	/*
+ 	 * This check can't live in kmem destruction function,
+ 	 * since the charges will outlive the cgroup
+ 	 */
+ 	kmem_usage = res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ 	WARN_ON(kmem_usage != 0);
+ }
+#else
+ static void disarm_kmem_keys(struct mem_cgroup *memcg)
+ { /* Stub for builds without CONFIG_MEMCG_KMEM: does nothing. */
+ }
+#endif /* CONFIG_MEMCG_KMEM */
+
+ static void disarm_static_keys(struct mem_cgroup *memcg)
+ { /* Release every static key reference held on behalf of @memcg. */
+ disarm_sock_keys(memcg); /* socket limit key (no-op stub in some configs) */
+ disarm_kmem_keys(memcg); /* kmem key and kmemcg_id (no-op stub without MEMCG_KMEM) */
+ }
+
+static void drain_all_stock_async(struct mem_cgroup *memcg);
+
+ /* Return @memcg's per-zone info for @zone, located by node id and zone index. */
+ static struct mem_cgroup_per_zone *
+ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
+ {
+ 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[zone_to_nid(zone)];
+
+ 	return &pn->zoneinfo[zone_idx(zone)];
+ }
+
+ /* Return the cgroup_subsys_state embedded in @memcg. */
+ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
+ {
+ 	struct cgroup_subsys_state *css = &memcg->css;
+
+ 	return css;
+ }
+
+ /* Return @memcg's per-zone info for the node and zone that @page lives in. */
+ static struct mem_cgroup_per_zone *
+ mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+ {
+ 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[page_to_nid(page)];
+
+ 	return &pn->zoneinfo[page_zonenum(page)];
+ }
+
+ /* Return the soft limit tree for node @nid, zone index @zid. */
+ static struct mem_cgroup_tree_per_zone *
+ soft_limit_tree_node_zone(int nid, int zid)
+ {
+ 	struct mem_cgroup_tree_per_node *node_trees;
+
+ 	node_trees = soft_limit_tree.rb_tree_per_node[nid];
+ 	return &node_trees->rb_tree_per_zone[zid];
+ }
+
+ /* Return the soft limit tree for the node and zone that @page lives in. */
+ static struct mem_cgroup_tree_per_zone *
+ soft_limit_tree_from_page(struct page *page)
+ {
+ 	return soft_limit_tree_node_zone(page_to_nid(page),
+ 					 page_zonenum(page));
+ }
+
+ /*
+  * Link @mz into the soft limit tree @mctz, keyed by @new_usage_in_excess.
+  *
+  * Nothing happens if @mz is already on the tree. Otherwise the excess is
+  * recorded in @mz, and @mz is inserted unless the excess is zero. Callers
+  * in this file hold mctz->lock.
+  */
+ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
+ 				struct mem_cgroup_tree_per_zone *mctz,
+ 				unsigned long long new_usage_in_excess)
+ {
+ 	struct rb_node **link = &mctz->rb_root.rb_node;
+ 	struct rb_node *parent = NULL;
+
+ 	if (mz->on_tree)
+ 		return;
+
+ 	mz->usage_in_excess = new_usage_in_excess;
+ 	if (!mz->usage_in_excess)
+ 		return;
+
+ 	while (*link) {
+ 		struct mem_cgroup_per_zone *cur;
+
+ 		parent = *link;
+ 		cur = rb_entry(parent, struct mem_cgroup_per_zone, tree_node);
+ 		/*
+ 		 * Smaller excess goes left. Equal or larger goes right:
+ 		 * we can't avoid mem cgroups that are over their soft
+ 		 * limit by the same amount.
+ 		 */
+ 		link = (mz->usage_in_excess < cur->usage_in_excess) ?
+ 			&parent->rb_left : &parent->rb_right;
+ 	}
+
+ 	rb_link_node(&mz->tree_node, parent, link);
+ 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+ 	mz->on_tree = true;
+ }
+
+ /*
+  * Unlink @mz from the soft limit tree @mctz if it is currently on it.
+  * Callers in this file hold mctz->lock.
+  */
+ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ 				struct mem_cgroup_tree_per_zone *mctz)
+ {
+ 	if (mz->on_tree) {
+ 		rb_erase(&mz->tree_node, &mctz->rb_root);
+ 		mz->on_tree = false;
+ 	}
+ }
+
+ /* Locked wrapper: unlink @mz from @mctz while holding the tree lock. */
+ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ 				struct mem_cgroup_tree_per_zone *mctz)
+ {
+ 	spinlock_t *tree_lock = &mctz->lock;
+
+ 	spin_lock(tree_lock);
+ 	__mem_cgroup_remove_exceeded(mz, mctz);
+ 	spin_unlock(tree_lock);
+ }
+
+
+ /*
+  * Re-position @memcg and all of its ancestors in the soft limit tree of
+  * the zone that @page belongs to, according to their current soft limit
+  * excess.
+  */
+ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+ {
+ 	struct mem_cgroup_tree_per_zone *mctz = soft_limit_tree_from_page(page);
+
+ 	/*
+ 	 * Necessary to update all ancestors when hierarchy is used.
+ 	 * because their event counter is not touched.
+ 	 */
+ 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ 		struct mem_cgroup_per_zone *mz;
+ 		unsigned long long excess;
+
+ 		mz = mem_cgroup_page_zoneinfo(memcg, page);
+ 		excess = res_counter_soft_limit_excess(&memcg->res);
+
+ 		/*
+ 		 * The tree only needs touching if mz is on it or the
+ 		 * memcg is over its soft limit.
+ 		 */
+ 		if (!excess && !mz->on_tree)
+ 			continue;
+
+ 		spin_lock(&mctz->lock);
+ 		/* if on-tree, remove it */
+ 		if (mz->on_tree)
+ 			__mem_cgroup_remove_exceeded(mz, mctz);
+ 		/*
+ 		 * Insert again. mz->usage_in_excess will be updated.
+ 		 * If excess is 0, no tree ops.
+ 		 */
+ 		__mem_cgroup_insert_exceeded(mz, mctz, excess);
+ 		spin_unlock(&mctz->lock);
+ 	}
+ }
+
+ /* Unlink @memcg's per-zone entries from the soft limit tree of every node and zone. */
+ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+ {
+ 	int nid;
+
+ 	for_each_node(nid) {
+ 		int zid = 0;
+
+ 		while (zid < MAX_NR_ZONES) {
+ 			struct mem_cgroup_per_zone *mz;
+ 			struct mem_cgroup_tree_per_zone *mctz;
+
+ 			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ 			mctz = soft_limit_tree_node_zone(nid, zid);
+ 			mem_cgroup_remove_exceeded(mz, mctz);
+ 			zid++;
+ 		}
+ 	}
+ }
+
+ /*
+  * Pop the entry with the largest soft limit excess from @mctz.
+  *
+  * Entries whose memcg no longer exceeds its soft limit, or whose css
+  * reference cannot be taken, are removed and skipped. Returns the first
+  * usable entry with a css reference held on its memcg, or NULL when the
+  * tree is empty. Callers in this file hold mctz->lock.
+  */
+ static struct mem_cgroup_per_zone *
+ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+ {
+ 	for (;;) {
+ 		struct rb_node *rightmost = rb_last(&mctz->rb_root);
+ 		struct mem_cgroup_per_zone *mz;
+
+ 		if (!rightmost)
+ 			return NULL;	/* Nothing to reclaim from */
+
+ 		mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+ 		/*
+ 		 * Remove the node now, but someone else can add it back;
+ 		 * it is added back at the end of reclaim at its correct
+ 		 * position in the tree.
+ 		 */
+ 		__mem_cgroup_remove_exceeded(mz, mctz);
+ 		if (res_counter_soft_limit_excess(&mz->memcg->res) &&
+ 		    css_tryget_online(&mz->memcg->css))
+ 			return mz;
+ 	}
+ }
+
+ /* Locked wrapper around __mem_cgroup_largest_soft_limit_node(). */
+ static struct mem_cgroup_per_zone *
+ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+ {
+ 	struct mem_cgroup_per_zone *largest;
+ 	spinlock_t *tree_lock = &mctz->lock;
+
+ 	spin_lock(tree_lock);
+ 	largest = __mem_cgroup_largest_soft_limit_node(mctz);
+ 	spin_unlock(tree_lock);
+
+ 	return largest;
+ }
+
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both of vmstat[] and percpu_counter has threshold and do periodic
+ * synchronization to implement "quick" read. There are trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+ * a periodic synchronizion of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+ * common workload, threashold and synchonization as vmstat[] should be
+ * implemented.
+ */
+static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
+{
+ long val = 0;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ val += per_cpu(memcg->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&memcg->pcp_counter_lock);
+ val += memcg->nocpu_base.count[idx];
+ spin_unlock(&memcg->pcp_counter_lock);
+#endif
+ put_online_cpus();
+ return val;
+}
+
+ /* Adjust this cpu's swap counter for @memcg: +1 on @charge, -1 otherwise. */
+ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+ 					bool charge)
+ {
+ 	int delta;
+
+ 	if (charge)
+ 		delta = 1;
+ 	else
+ 		delta = -1;
+
+ 	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], delta);
+ }
+
+ /*
+  * Sum event counter @idx of @memcg over all online cpus, plus (with
+  * CONFIG_HOTPLUG_CPU) the value parked in memcg->nocpu_base.
+  */
+ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
+ 					    enum mem_cgroup_events_index idx)
+ {
+ 	unsigned long total = 0;
+ 	int cpu;
+
+ 	get_online_cpus();
+ 	for_each_online_cpu(cpu) {
+ 		total += per_cpu(memcg->stat->events[idx], cpu);
+ 	}
+ #ifdef CONFIG_HOTPLUG_CPU
+ 	spin_lock(&memcg->pcp_counter_lock);
+ 	total += memcg->nocpu_base.events[idx];
+ 	spin_unlock(&memcg->pcp_counter_lock);
+ #endif
+ 	put_online_cpus();
+
+ 	return total;
+ }
+
+ /*
+  * Update this cpu's statistics of @memcg for a charge (@nr_pages > 0) or
+  * uncharge (@nr_pages <= 0) of @page.
+  *
+  * Adjusts the RSS or CACHE counter depending on @anon, the RSS_HUGE
+  * counter for transparent huge pages, counts one PGPGIN/PGPGOUT event
+  * and advances nr_page_events by the absolute page count.
+  */
+ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
+ 					 struct page *page,
+ 					 bool anon, int nr_pages)
+ {
+ 	/*
+ 	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+ 	 * counted as CACHE even if it's on ANON LRU.
+ 	 */
+ 	enum mem_cgroup_stat_index idx = anon ? MEM_CGROUP_STAT_RSS :
+ 						MEM_CGROUP_STAT_CACHE;
+
+ 	__this_cpu_add(memcg->stat->count[idx], nr_pages);
+
+ 	if (PageTransHuge(page))
+ 		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ 			       nr_pages);
+
+ 	/* pagein of a big page is an event. So, ignore page size */
+ 	if (nr_pages <= 0) {
+ 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+ 		nr_pages = -nr_pages; /* for event */
+ 	} else {
+ 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+ 	}
+
+ 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+ }
+
+ /*
+  * Return the recorded size of list @lru for the per-zone memcg info that
+  * embeds @lruvec.
+  */
+ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+ {
+ 	struct mem_cgroup_per_zone *owner =
+ 		container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+
+ 	return owner->lru_size[lru];
+ }
+
+ /*
+  * Sum, over all zones of node @nid, the sizes of those LRU lists of
+  * @memcg whose bit is set in @lru_mask.
+  */
+ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ 						  int nid,
+ 						  unsigned int lru_mask)
+ {
+ 	unsigned long total = 0;
+ 	int zid;
+
+ 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ 	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ 		struct mem_cgroup_per_zone *mz =
+ 			&memcg->nodeinfo[nid]->zoneinfo[zid];
+ 		enum lru_list lru;
+
+ 		for_each_lru(lru) {
+ 			if (BIT(lru) & lru_mask)
+ 				total += mz->lru_size[lru];
+ 		}
+ 	}
+
+ 	return total;
+ }
+
+ /*
+  * Sum the sizes of @memcg's LRU lists selected by @lru_mask across every
+  * node in the N_MEMORY state.
+  */
+ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ 					     unsigned int lru_mask)
+ {
+ 	unsigned long total = 0;
+ 	int nid;
+
+ 	for_each_node_state(nid, N_MEMORY) {
+ 		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+ 	}
+
+ 	return total;
+ }
+
+ /*
+  * Check whether this cpu's page event count of @memcg has passed the next
+  * target recorded for @target.
+  *
+  * If so, the target is advanced by the per-target interval (left unchanged
+  * for unknown targets) and true is returned; otherwise false.
+  * Uses __this_cpu accessors, so the caller must prevent migration.
+  */
+ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+ 				       enum mem_cgroup_events_target target)
+ {
+ 	unsigned long events = __this_cpu_read(memcg->stat->nr_page_events);
+ 	unsigned long deadline = __this_cpu_read(memcg->stat->targets[target]);
+
+ 	/* from time_after() in jiffies.h */
+ 	if ((long)deadline - (long)events >= 0)
+ 		return false;
+
+ 	switch (target) {
+ 	case MEM_CGROUP_TARGET_THRESH:
+ 		deadline = events + THRESHOLDS_EVENTS_TARGET;
+ 		break;
+ 	case MEM_CGROUP_TARGET_SOFTLIMIT:
+ 		deadline = events + SOFTLIMIT_EVENTS_TARGET;
+ 		break;
+ 	case MEM_CGROUP_TARGET_NUMAINFO:
+ 		deadline = events + NUMAINFO_EVENTS_TARGET;
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 	__this_cpu_write(memcg->stat->targets[target], deadline);
+
+ 	return true;
+ }
+
+/*
+ * Check events in order.
+ *
+ */
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+{
+ preempt_disable();
+ /* threshold event is triggered in finer grain than soft limit */
+ if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_THRESH))) {
+ bool do_softlimit;
+ bool do_numainfo __maybe_unused;
+
+ do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+#if MAX_NUMNODES > 1
+ do_numainfo = mem_cgroup_event_ratelimit(memcg,
+ MEM_CGROUP_TARGET_NUMAINFO);
+#endif
+ preempt_enable();
+
+ mem_cgroup_threshold(memcg);
+ if (unlikely(do_softlimit))
+ mem_cgroup_update_tree(memcg, page);
+#if MAX_NUMNODES > 1
+ if (unlikely(do_numainfo))
+ atomic_inc(&memcg->numainfo_events);
+#endif
+ } else
+ preempt_enable();
+}
+
+ /*
+  * Return the memcg that task @p belongs to, or NULL when @p is NULL.
+  *
+  * mm_update_next_owner() may clear mm->owner to NULL
+  * if it races with swapoff, page migration, etc.
+  * So this can be called with p == NULL.
+  */
+ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+ {
+ 	struct cgroup_subsys_state *css;
+
+ 	if (unlikely(!p))
+ 		return NULL;
+
+ 	css = task_css(p, memory_cgrp_id);
+ 	return mem_cgroup_from_css(css);
+ }
+
+ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+ {
+ struct mem_cgroup *memcg = NULL;
+
+ rcu_read_lock();
+ do {
+ /*
+ * Page cache insertions can happen without an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ * In that case, and when the owner lookup yields no
+ * memcg, fall back to the root memcg.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ memcg = root_mem_cgroup;
+ }
+ } while (!css_tryget_online(&memcg->css)); /* redo the lookup until a css reference is obtained */
+ rcu_read_unlock();
+ return memcg; /* never NULL; returned with a css reference held */
+ }
+
+/*
+ * Returns a next (in a pre-order walk) alive memcg (with elevated css
+ * ref. count) or NULL if the whole root's subtree has been visited.
+ *
+ * helper function to be used by mem_cgroup_iter
+ */
+static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
+ struct mem_cgroup *last_visited)
+{
+ struct cgroup_subsys_state *prev_css, *next_css;
+
+ prev_css = last_visited ? &last_visited->css : NULL;
+skip_node:
+ next_css = css_next_descendant_pre(prev_css, &root->css);
+
+ /*
+ * Even if we found a group we have to make sure it is
+ * alive. css && !memcg means that the groups should be
+ * skipped and we should continue the tree walk.
+ * last_visited css is safe to use because it is
+ * protected by css_get and the tree walk is rcu safe.
+ *
+ * We do not take a reference on the root of the tree walk
+ * because we might race with the root removal when it would
+ * be the only node in the iterated hierarchy and mem_cgroup_iter
+ * would end up in an endless loop because it expects that at
+ * least one valid node will be returned. Root cannot disappear
+ * because caller of the iterator should hold it already so
+ * skipping css reference should be safe.
+ */
+ if (next_css) {
+ if ((next_css == &root->css) ||
+ ((next_css->flags & CSS_ONLINE) &&
+ css_tryget_online(next_css)))
+ return mem_cgroup_from_css(next_css);
+
+ prev_css = next_css;
+ goto skip_node;
+ }
+
+ return NULL;
+}
+
+ /*
+  * Invalidate the cached reclaim iterators of @root.
+  *
+  * When a group in the hierarchy below root is destroyed, the
+  * hierarchy iterator can no longer be trusted since it might
+  * have pointed to the destroyed group. Bumping dead_count makes
+  * mem_cgroup_iter_load() ignore any previously stored position.
+  */
+ static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+ {
+ 	atomic_t *generation = &root->dead_count;
+
+ 	atomic_inc(generation);
+ }
+
+ static struct mem_cgroup *
+ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *root,
+ int *sequence)
+ {
+ struct mem_cgroup *position = NULL;
+ /*
+ * Load the position cached in @iter, provided it is still valid.
+ * root->dead_count is sampled into *@sequence; the cached position
+ * is only used when it was stored under that same count. Returns
+ * the cached memcg (with a css reference unless it is @root), or
+ * NULL when there is no usable position.
+ *
+ * A cgroup destruction happens in two stages: offlining and
+ * release. They are separated by a RCU grace period.
+ *
+ * If the iterator is valid, we may still race with an
+ * offlining. The RCU lock ensures the object won't be
+ * released, tryget will fail if we lost the race.
+ */
+ *sequence = atomic_read(&root->dead_count);
+ if (iter->last_dead_count == *sequence) {
+ smp_rmb(); /* pairs with smp_wmb() in mem_cgroup_iter_update() */
+ position = iter->last_visited;
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal and returning NULL would end up in
+ * an endless loop on the iterator user level when root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget_online(&position->css))
+ position = NULL;
+ }
+ return position;
+ }
+
+ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *last_visited,
+ struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
+ int sequence)
+ {
+ /*
+ * Drop the reference mem_cgroup_iter_load() took on @last_visited:
+ * root reference counting symmetric to mem_cgroup_iter_load
+ */
+ if (last_visited && last_visited != root)
+ css_put(&last_visited->css);
+ /*
+ * We store the sequence count from the time @last_visited was
+ * loaded successfully instead of rereading it here so that we
+ * don't lose destruction events in between. We could have
+ * raced with the destruction of @new_position after all.
+ */
+ iter->last_visited = new_position;
+ smp_wmb(); /* publish last_visited before last_dead_count; pairs with smp_rmb() in mem_cgroup_iter_load() */
+ iter->last_dead_count = sequence;
+ }
+
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a zone and a priority level in @reclaim to
+ * divide up the memcgs in the hierarchy among all concurrent
+ * reclaimers operating on the same zone and priority.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+ struct mem_cgroup *prev,
+ struct mem_cgroup_reclaim_cookie *reclaim)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct mem_cgroup *last_visited = NULL;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ if (!root)
+ root = root_mem_cgroup;
+
+ if (prev && !reclaim)
+ last_visited = prev;
+
+ if (!root->use_hierarchy && root != root_mem_cgroup) {
+ if (prev)
+ goto out_css_put;
+ return root;
+ }
+
+ rcu_read_lock();
+ while (!memcg) {
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ int uninitialized_var(seq);
+
+ if (reclaim) {
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ iter = &mz->reclaim_iter[reclaim->priority];
+ if (prev && reclaim->generation != iter->generation) {
+ iter->last_visited = NULL;
+ goto out_unlock;
+ }
+
+ last_visited = mem_cgroup_iter_load(iter, root, &seq);
+ }
+
+ memcg = __mem_cgroup_iter_next(root, last_visited);
+
+ if (reclaim) {
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
+
+ if (!memcg)
+ iter->generation++;
+ else if (!prev && memcg)
+ reclaim->generation = iter->generation;
+ }
+
+ if (prev && !memcg)
+ goto out_unlock;
+ }
+out_unlock:
+ rcu_read_unlock();
+out_css_put:
+ if (prev && prev != root)
+ css_put(&prev->css);
+
+ return memcg;
+}
+
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+ struct mem_cgroup *prev)
+{
+ if (!root)
+ root = root_mem_cgroup;
+ if (prev && prev != root)
+ css_put(&prev->css);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree). If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root) \
+ for (iter = mem_cgroup_iter(root, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter) \
+ for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
+ iter != NULL; \
+ iter = mem_cgroup_iter(NULL, iter, NULL))
+
+ /*
+  * Count a page fault event against the memcg of @mm's owner.
+  *
+  * Only PGFAULT and PGMAJFAULT are accepted for @idx; anything else is a
+  * BUG. Nothing is counted when the owner lookup yields no memcg.
+  */
+ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+ {
+ 	struct mem_cgroup *memcg;
+
+ 	rcu_read_lock();
+ 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ 	if (likely(memcg)) {
+ 		switch (idx) {
+ 		case PGFAULT:
+ 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+ 			break;
+ 		case PGMAJFAULT:
+ 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ 			break;
+ 		default:
+ 			BUG();
+ 		}
+ 	}
+ 	rcu_read_unlock();
+ }
+ EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
+
+/**
+ * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
+ * @zone: zone of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for the given @zone and
+ * @mem. This can be the global zone lruvec, if the memory controller
+ * is disabled.
+ */
+struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct lruvec *lruvec;
+
+ if (mem_cgroup_disabled()) {
+ lruvec = &zone->lruvec;
+ goto out;
+ }
+
+ mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->zone here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->zone != zone))
+ lruvec->zone = zone;
+ return lruvec;
+}
+
+/*
+ * Following LRU functions are allowed to be used without PCG_LOCK.
+ * Operations are called by routine of global LRU independently from memcg.
+ * What we have to take care of here is validness of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happens when
+ * 1. charge
+ * 2. moving account
+ * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
+ * It is added to LRU before charge.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
+
+/**
+ * mem_cgroup_page_lruvec - return lruvec for adding an lru page
+ * @page: the page
+ * @zone: zone of the page
+ */
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
+ struct lruvec *lruvec;
+
+ if (mem_cgroup_disabled()) {
+ lruvec = &zone->lruvec;
+ goto out;
+ }
+
+ pc = lookup_page_cgroup(page);
+ memcg = pc->mem_cgroup;
+
+ /*
+ * Surreptitiously switch any uncharged offlist page to root:
+ * an uncharged page off lru does nothing to secure
+ * its former mem_cgroup from sudden removal.
+ *
+ * Our caller holds lru_lock, and PageCgroupUsed is updated
+ * under page_cgroup lock: between them, they make all uses
+ * of pc->mem_cgroup safe.
+ */
+ if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+ pc->mem_cgroup = memcg = root_mem_cgroup;
+
+ mz = mem_cgroup_page_zoneinfo(memcg, page);
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->zone here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->zone != zone))
+ lruvec->zone = zone;
+ return lruvec;
+}
+
+/**
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @nr_pages: positive when adding or negative when removing
+ *
+ * Must be called whenever a page is put on or taken off an lru list.
+ * Does nothing when the memory controller is disabled.
+ */
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages)
+{
+ struct mem_cgroup_per_zone *per_zone;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* The lruvec is embedded in the per-zone info; step back to it. */
+ per_zone = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ per_zone->lru_size[lru] += nr_pages;
+ /* The counter must never appear negative when viewed as signed. */
+ VM_BUG_ON((long)per_zone->lru_size[lru] < 0);
+}
+
+/*
+ * Returns true when @memcg is @root_memcg itself or, provided that
+ * @root_memcg has use_hierarchy set, when @memcg's cgroup is a descendant
+ * of @root_memcg's cgroup. A NULL @memcg only matches a NULL @root_memcg.
+ */
+bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
+{
+ if (root_memcg != memcg) {
+ if (!root_memcg->use_hierarchy || !memcg)
+ return false;
+ return cgroup_is_descendant(memcg->css.cgroup,
+ root_memcg->css.cgroup);
+ }
+ return true;
+}
+
+/*
+ * Wrapper around __mem_cgroup_same_or_subtree() that performs the check
+ * inside an rcu_read_lock()/rcu_read_unlock() section.
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+ struct mem_cgroup *memcg)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * task_in_mem_cgroup - check whether @task belongs to @memcg or its subtree
+ * @task: task to look up
+ * @memcg: memcg (hierarchy root) to test against
+ *
+ * The memcg of @task is taken from the mm of a thread found by
+ * find_lock_task_mm(), or from the task itself when no such thread exists.
+ * Returns the result of mem_cgroup_same_or_subtree(@memcg, task's memcg).
+ */
+bool task_in_mem_cgroup(struct task_struct *task,
+ const struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *curr = NULL;
+ struct task_struct *p;
+ bool ret;
+
+ p = find_lock_task_mm(task);
+ if (p) {
+ /* presumably returns with a css reference held - confirm */
+ curr = get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
+ } else {
+ /*
+ * All threads may have already detached their mm's, but the oom
+ * killer still needs to detect if they have already been oom
+ * killed to prevent needlessly killing additional tasks.
+ */
+ rcu_read_lock();
+ curr = mem_cgroup_from_task(task);
+ if (curr)
+ css_get(&curr->css);
+ rcu_read_unlock();
+ }
+ /*
+ * We should check use_hierarchy of "memcg" not "curr". Because checking
+ * use_hierarchy of "curr" here make this function true if hierarchy is
+ * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
+ * hierarchy(even if use_hierarchy is disabled in "memcg").
+ */
+ ret = mem_cgroup_same_or_subtree(memcg, curr);
+ /*
+ * The lookup above tolerates a NULL memcg and only takes a reference
+ * when one was found, so only drop a reference in that case too.
+ */
+ if (curr)
+ css_put(&curr->css);
+ return ret;
+}
+
+/*
+ * Returns non-zero when the inactive anon list of @lruvec is small compared
+ * to the active one. The allowed active:inactive ratio is 1 below 1GB of
+ * anon pages and int_sqrt(10 * size_in_GB) otherwise.
+ */
+int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+ unsigned long nr_inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ unsigned long nr_active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+ /* total anon size in GB: pages >> (30 - PAGE_SHIFT) */
+ unsigned long total_gb = (nr_inactive + nr_active) >> (30 - PAGE_SHIFT);
+ unsigned long ratio = total_gb ? int_sqrt(10 * total_gb) : 1;
+
+ return nr_inactive * ratio < nr_active;
+}
+
+/*
+ * Map a pointer to the res_counter embedded as @member in a struct
+ * mem_cgroup back to the containing mem_cgroup.
+ */
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
+
+/**
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @memcg: the memory cgroup
+ *
+ * Returns the maximum amount of memory @memcg can be charged with, in
+ * pages. With swap accounting enabled this is the smaller of the margins
+ * of the memory and memory+swap counters.
+ */
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+{
+ unsigned long long room = res_counter_margin(&memcg->res);
+
+ if (do_swap_account) {
+ unsigned long long memsw_room = res_counter_margin(&memcg->memsw);
+
+ if (memsw_room < room)
+ room = memsw_room;
+ }
+ /* res_counter margins are shifted down by PAGE_SHIFT to get pages */
+ return room >> PAGE_SHIFT;
+}
+
+/*
+ * Returns the swappiness to use for @memcg: the memcg's own value, or the
+ * global vm_swappiness when the controller is disabled or @memcg has no
+ * parent css (i.e. it is the root).
+ */
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_disabled() && memcg->css.parent)
+ return memcg->swappiness;
+
+ /* disabled controller or root memcg */
+ return vm_swappiness;
+}
+
+/*
+ * memcg->moving_account is used for checking possibility that some thread is
+ * calling move_account(). When a thread on CPU-A starts moving pages under
+ * a memcg, other threads should check memcg->moving_account under
+ * rcu_read_lock(), like this:
+ *
+ * CPU-A CPU-B
+ * rcu_read_lock()
+ * memcg->moving_account+1 if (memcg->moving_account)
+ * take heavy locks.
+ * synchronize_rcu() update something.
+ * rcu_read_unlock()
+ * start move here.
+ */
+
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
+
+/*
+ * Announce that charges of @memcg are about to be moved: bump the global
+ * memcg_moving counter and memcg->moving_account, then wait for an RCU
+ * grace period with synchronize_rcu() before returning.
+ */
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg_moving);
+ atomic_inc(&memcg->moving_account);
+ synchronize_rcu();
+}
+
+/*
+ * Undo mem_cgroup_start_move(): drop the global memcg_moving counter and
+ * memcg->moving_account again. A NULL @memcg is silently ignored.
+ */
+static void mem_cgroup_end_move(struct mem_cgroup *memcg)
+{
+ /*
+ * mem_cgroup_clear_mc() may call this function with NULL; the check
+ * lives here in the callee rather than in every caller.
+ */
+ if (!memcg)
+ return;
+
+ atomic_dec(&memcg_moving);
+ atomic_dec(&memcg->moving_account);
+}
+
+/*
+ * Check whether a charge move involves @memcg.
+ *
+ * Returns true when mc.from or mc.to is @memcg itself or lies in the
+ * hierarchy below it. This is used for waiting at high memory pressure
+ * caused by "move".
+ */
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *src, *dst;
+ bool under_move = false;
+
+ /*
+ * Unlike the task_move routines, mc.to and mc.from are not read under
+ * cgroup_mutex here; mc.lock provides the mutual exclusion instead.
+ */
+ spin_lock(&mc.lock);
+ src = mc.from;
+ dst = mc.to;
+ if (src)
+ under_move = mem_cgroup_same_or_subtree(memcg, src) ||
+ mem_cgroup_same_or_subtree(memcg, dst);
+ spin_unlock(&mc.lock);
+ return under_move;
+}
+
+/*
+ * If a charge move affecting @memcg is in progress and is being done by a
+ * task other than current, sleep interruptibly on mc.waitq.
+ *
+ * Returns true if the caller went through the wait path, false if no
+ * wait was attempted.
+ */
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(memcg)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * Take this lock when
+ * - a code tries to modify page's memcg while it's USED.
+ * - a code tries to modify page state accounting in a memcg.
+ *
+ * Acquires memcg->move_lock with interrupts disabled; the previous
+ * interrupt state is stored in *@flags for move_unlock_mem_cgroup().
+ */
+static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_lock_irqsave(&memcg->move_lock, *flags);
+}
+
+/*
+ * Release memcg->move_lock and restore the interrupt state saved in
+ * *@flags by move_lock_mem_cgroup().
+ */
+static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+/**
+ * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * @memcg: The memory cgroup that went over limit
+ * @p: Task that is going to be killed
+ *
+ * Prints the cgroup paths of @p and @memcg, the usage/limit/failcnt of the
+ * memory, memory+swap and kmem counters of @memcg, and the statistics of
+ * every memcg in the hierarchy below @memcg. Does nothing if @p is NULL.
+ *
+ * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
+ * enabled
+ */
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+{
+ /* oom_info_lock ensures that parallel ooms do not interleave */
+ static DEFINE_MUTEX(oom_info_lock);
+ struct mem_cgroup *iter;
+ unsigned int i;
+
+ if (!p)
+ return;
+
+ mutex_lock(&oom_info_lock);
+ rcu_read_lock();
+
+ /*
+ * The header is one logical line assembled from several printk calls;
+ * everything after the first piece must be a continuation (pr_cont),
+ * otherwise each piece starts a new log record.
+ */
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont("\n");
+
+ rcu_read_unlock();
+
+ /* res_counter values are shifted down by 10 and printed as kB */
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->res, RES_FAILCNT));
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(iter->css.cgroup);
+ pr_cont(":");
+
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ /* the swap statistic is skipped without swap accounting */
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+ pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+ K(mem_cgroup_read_stat(iter, i)));
+ }
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+ K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+ pr_cont("\n");
+ }
+ mutex_unlock(&oom_info_lock);
+}
+
+/*
+ * Count the memcgs in the hierarchy tree rooted at @memcg, including
+ * @memcg itself; a memcg without children therefore yields 1.
+ */
+static int mem_cgroup_count_children(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *pos;
+ int count = 0;
+
+ for_each_mem_cgroup_tree(pos, memcg) {
+ count++;
+ }
+ return count;
+}
+
+/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ *
+ * The result starts as the RES_LIMIT of the memory counter. When
+ * mem_cgroup_swappiness() is non-zero the size of all swap is added and
+ * the sum is capped by the RES_LIMIT of the memory+swap counter.
+ */
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ u64 limit;
+
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+
+ /*
+ * Do not consider swap space if we cannot swap due to swappiness
+ */
+ if (mem_cgroup_swappiness(memcg)) {
+ u64 memsw;
+
+ /*
+ * Widen the page count before shifting: shifting in the native
+ * type of total_swap_pages could overflow before the result is
+ * added to the 64-bit limit.
+ */
+ limit += (u64)total_swap_pages << PAGE_SHIFT;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+
+ /*
+ * If memsw is finite and limits the amount of swap space
+ * available to this memcg, return that limit.
+ */
+ limit = min(limit, memsw);
+ }
+
+ return limit;
+}
+
+/*
+ * Select and kill a task on behalf of a memcg that hit its limit.
+ *
+ * If current is already dying (fatal signal pending or PF_EXITING) it is
+ * only marked TIF_MEMDIE and nothing else is done. Otherwise every task
+ * in the hierarchy below @memcg is scanned; the task with the highest
+ * oom_badness() score, or one forced by OOM_SCAN_SELECT, is remembered
+ * with a task reference held and handed to oom_kill_process().
+ */
+static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
+{
+ struct mem_cgroup *iter;
+ unsigned long chosen_points = 0;
+ unsigned long totalpages;
+ unsigned int points = 0;
+ struct task_struct *chosen = NULL;
+
+ /*
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
+ */
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ set_thread_flag(TIF_MEMDIE);
+ return;
+ }
+
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ /* limit in pages, but never 0 because it is used as a divisor below */
+ totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&iter->css, &it);
+ while ((task = css_task_iter_next(&it))) {
+ switch (oom_scan_process_thread(task, totalpages, NULL,
+ false)) {
+ case OOM_SCAN_SELECT:
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = ULONG_MAX;
+ get_task_struct(chosen);
+ /* fall through */
+ case OOM_SCAN_CONTINUE:
+ continue;
+ case OOM_SCAN_ABORT:
+ /* stop both iterators and drop any candidate */
+ css_task_iter_end(&it);
+ mem_cgroup_iter_break(memcg, iter);
+ if (chosen)
+ put_task_struct(chosen);
+ return;
+ case OOM_SCAN_OK:
+ break;
+ };
+ points = oom_badness(task, memcg, NULL, totalpages);
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ /* new best candidate: swap the held task reference */
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
+ }
+ css_task_iter_end(&it);
+ }
+
+ if (!chosen)
+ return;
+ points = chosen_points * 1000 / totalpages;
+ oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+ NULL, "Memory cgroup out of memory");
+}
+
+/*
+ * Reclaim from @memcg, making at most MEM_CGROUP_MAX_RECLAIM_LOOPS passes.
+ * @memcg: memcg to reclaim from
+ * @gfp_mask: passed through to try_to_free_mem_cgroup_pages()
+ * @flags: MEM_CGROUP_RECLAIM_NOSWAP and/or MEM_CGROUP_RECLAIM_SHRINK
+ *
+ * Returns the sum of the values returned by
+ * try_to_free_mem_cgroup_pages() over all passes.
+ */
+static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned long flags)
+{
+ unsigned long total = 0;
+ bool noswap = false;
+ int loop;
+
+ if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
+ noswap = true;
+ if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
+ noswap = true;
+
+ for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+ /* from the second pass on, also kick off a per-cpu stock drain */
+ if (loop)
+ drain_all_stock_async(memcg);
+ total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+ /*
+ * Allow limit shrinkers, which are triggered directly
+ * by userspace, to catch signals and stop reclaim
+ * after minimal progress, regardless of the margin.
+ */
+ if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
+ break;
+ if (mem_cgroup_margin(memcg))
+ break;
+ /*
+ * If nothing was reclaimed after two attempts, there
+ * may be no reclaimable pages in this hierarchy.
+ */
+ if (loop && !total)
+ break;
+ }
+ return total;
+}
+
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @memcg: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants file only information.
+ *
+ * Returns true if @memcg has any reclaimable pages on node @nid: file LRU
+ * pages always count, anon LRU pages only count when @noswap is false and
+ * total_swap_pages is non-zero.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
+ int nid, bool noswap)
+{
+ if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ return true;
+
+ /* anon pages are only consulted when swapping is allowed and possible */
+ return !noswap && total_swap_pages &&
+ mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON);
+}
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ * Rebuilds memcg->scan_nodes as the set of N_MEMORY nodes on which @memcg
+ * has reclaimable pages. Nothing is done when numainfo_events is zero or
+ * when numainfo_updating was already non-zero.
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
+{
+ int nid;
+ /*
+ * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+ * pagein/pageout changes since the last update.
+ */
+ if (!atomic_read(&memcg->numainfo_events))
+ return;
+ /* only the caller that raises numainfo_updating from 0 proceeds */
+ if (atomic_inc_return(&memcg->numainfo_updating) > 1)
+ return;
+
+ /* make a nodemask where this memcg uses memory from */
+ memcg->scan_nodes = node_states[N_MEMORY];
+
+ for_each_node_mask(nid, node_states[N_MEMORY]) {
+
+ if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
+ node_clear(nid, memcg->scan_nodes);
+ }
+
+ atomic_set(&memcg->numainfo_events, 0);
+ atomic_set(&memcg->numainfo_updating, 0);
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, starting from anywhere is OK. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ *
+ * Returns the chosen node id and records it in memcg->last_scanned_node.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
+{
+ int node;
+
+ mem_cgroup_may_update_nodemask(memcg);
+ node = memcg->last_scanned_node;
+
+ /* advance round-robin, wrapping to the first node of the mask */
+ node = next_node(node, memcg->scan_nodes);
+ if (node == MAX_NUMNODES)
+ node = first_node(memcg->scan_nodes);
+ /*
+ * We call this when we hit limit, not when pages are added to LRU.
+ * No LRU may hold pages because all pages are UNEVICTABLE or
+ * memcg is too small and all pages are not on LRU. In that case,
+ * we use current node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
+ memcg->last_scanned_node = node;
+ return node;
+}
+
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to do double check.
+ *
+ * Returns true as soon as one node with reclaimable pages is found,
+ * false if no N_MEMORY node has any.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+ int nid;
+
+ /*
+ * quick check...making use of scan_node.
+ * We can skip unused nodes.
+ */
+ if (!nodes_empty(memcg->scan_nodes)) {
+ for (nid = first_node(memcg->scan_nodes);
+ nid < MAX_NUMNODES;
+ nid = next_node(nid, memcg->scan_nodes)) {
+
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ }
+ /*
+ * Check rest of nodes.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ /* nodes in scan_nodes were already tested above */
+ if (node_isset(nid, memcg->scan_nodes))
+ continue;
+ if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+ return true;
+ }
+ return false;
+}
+
+#else
+/* Variant used when MAX_NUMNODES is not greater than 1: always node 0. */
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+/*
+ * Variant used when MAX_NUMNODES is not greater than 1: only node 0
+ * needs to be checked for reclaimable pages.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+ return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
+
+/*
+ * Soft limit reclaim: walk the hierarchy below @root_memcg with
+ * mem_cgroup_iter() and shrink each reclaimable victim in @zone until
+ * @root_memcg no longer exceeds its soft limit or the loop gives up.
+ *
+ * What each shrink call reports in nr_scanned is added to *@total_scanned.
+ * Returns the accumulated return value of mem_cgroup_shrink_node_zone().
+ */
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+ struct zone *zone,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ struct mem_cgroup *victim = NULL;
+ int total = 0;
+ int loop = 0;
+ unsigned long excess;
+ unsigned long nr_scanned;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = 0,
+ };
+
+ /* soft limit excess at entry, in pages */
+ excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+ while (1) {
+ victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+ if (!victim) {
+ /* the iterator returned NULL; count it as a finished pass */
+ loop++;
+ if (loop >= 2) {
+ /*
+ * If we have not been able to reclaim
+ * anything, it might because there are
+ * no reclaimable pages under this hierarchy
+ */
+ if (!total)
+ break;
+ /*
+ * We want to do more targeted reclaim.
+ * excess >> 2 is not to excessive so as to
+ * reclaim too much, nor too less that we keep
+ * coming back to reclaim from this cgroup
+ */
+ if (total >= (excess >> 2) ||
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+ break;
+ }
+ continue;
+ }
+ if (!mem_cgroup_reclaimable(victim, false))
+ continue;
+ total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+ zone, &nr_scanned);
+ *total_scanned += nr_scanned;
+ if (!res_counter_soft_limit_excess(&root_memcg->res))
+ break;
+ }
+ mem_cgroup_iter_break(root_memcg, victim);
+ return total;
+}
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ *
+ * Tries to set oom_lock on every memcg in the hierarchy below @memcg while
+ * holding memcg_oom_lock. If one of them is already locked, the flags set
+ * so far are cleared again and false is returned; otherwise true.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter, *failed = NULL;
+
+ spin_lock(&memcg_oom_lock);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter->oom_lock) {
+ /*
+ * this subtree of our hierarchy is already locked
+ * so we cannot give a lock.
+ */
+ failed = iter;
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ } else
+ iter->oom_lock = true;
+ }
+
+ if (failed) {
+ /*
+ * OK, we failed to lock the whole subtree so we have
+ * to clean up what we set up to the failing subtree
+ */
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == failed) {
+ mem_cgroup_iter_break(memcg, iter);
+ break;
+ }
+ iter->oom_lock = false;
+ }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+ spin_unlock(&memcg_oom_lock);
+
+ return !failed;
+}
+
+/*
+ * Counterpart of mem_cgroup_oom_trylock(): clear oom_lock on every memcg
+ * in the hierarchy below @memcg while holding memcg_oom_lock.
+ */
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+ for_each_mem_cgroup_tree(iter, memcg)
+ iter->oom_lock = false;
+ spin_unlock(&memcg_oom_lock);
+}
+
+/* Increment under_oom on every memcg in the hierarchy below @memcg. */
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ atomic_inc(&iter->under_oom);
+}
+
+/*
+ * Decrement under_oom on every memcg in the hierarchy below @memcg, but
+ * never below zero.
+ */
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+
+ /*
+ * When a new child is created while the hierarchy is under oom,
+ * mem_cgroup_oom_lock() may not be called. We have to use
+ * atomic_add_unless() here.
+ *
+ * NOTE(review): no mem_cgroup_oom_lock() is visible nearby; this
+ * presumably refers to mem_cgroup_mark_under_oom() - confirm.
+ */
+ for_each_mem_cgroup_tree(iter, memcg)
+ atomic_add_unless(&iter->under_oom, -1, 0);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Wait queue entry used on memcg_oom_waitq; it carries the memcg the
+ * sleeper is waiting on so that memcg_oom_wake_function() can filter
+ * wakeups.
+ */
+struct oom_wait_info {
+ struct mem_cgroup *memcg; /* memcg the waiter is sleeping on */
+ wait_queue_t wait; /* entry queued on memcg_oom_waitq */
+};
+
+/*
+ * Wake callback for entries on memcg_oom_waitq. @arg is the memcg being
+ * woken. The waiter is only woken (via autoremove_wake_function()) when
+ * its memcg and the woken memcg are the same or one is in the other's
+ * subtree; otherwise 0 is returned and the waiter keeps sleeping.
+ */
+static int memcg_oom_wake_function(wait_queue_t *wait,
+ unsigned mode, int sync, void *arg)
+{
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+ struct mem_cgroup *oom_wait_memcg;
+ struct oom_wait_info *oom_wait_info;
+
+ oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+ oom_wait_memcg = oom_wait_info->memcg;
+
+ /*
+ * Both of oom_wait_info->memcg and wake_memcg are stable under us.
+ * Then we can use css_is_ancestor without taking care of RCU.
+ *
+ * NOTE(review): the code below calls mem_cgroup_same_or_subtree(),
+ * not css_is_ancestor(); the sentence above looks stale - confirm.
+ */
+ if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
+ && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+/*
+ * Bump memcg->oom_wakeups and wake the sleepers on memcg_oom_waitq,
+ * passing @memcg as the wake key.
+ */
+static void memcg_wakeup_oom(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg->oom_wakeups);
+ /* for filtering, pass "memcg" as argument. */
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+/*
+ * Wake OOM waiters of @memcg, but only if @memcg is non-NULL and its
+ * under_oom count is non-zero.
+ */
+static void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+ if (memcg && atomic_read(&memcg->under_oom))
+ memcg_wakeup_oom(memcg);
+}
+
+/*
+ * Record a memcg OOM condition in current->memcg_oom for later handling.
+ * @memcg: memcg that is out of memory
+ * @mask: gfp mask of the failed charge
+ * @order: allocation order of the failed charge
+ *
+ * Does nothing unless current->memcg_oom.may_oom is set. Takes a css
+ * reference on @memcg; mem_cgroup_oom_synchronize() drops a css
+ * reference on the memcg recorded here.
+ */
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+ if (!current->memcg_oom.may_oom)
+ return;
+ /*
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * Also, the caller may handle a failed allocation gracefully
+ * (like optional page cache readahead) and so an OOM killer
+ * invocation might not even be necessary.
+ *
+ * That's why we don't do anything here except remember the
+ * OOM context and then deal with it at the end of the page
+ * fault when the stack is unwound, the locks are released,
+ * and when we know whether the fault was overall successful.
+ */
+ css_get(&memcg->css);
+ current->memcg_oom.memcg = memcg;
+ current->memcg_oom.gfp_mask = mask;
+ current->memcg_oom.order = order;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+ struct mem_cgroup *memcg = current->memcg_oom.memcg;
+ struct oom_wait_info owait;
+ bool locked;
+
+ /* OOM is global, do not handle */
+ if (!memcg)
+ return false;
+
+ if (!handle)
+ goto cleanup;
+
+ owait.memcg = memcg;
+ owait.wait.flags = 0;
+ owait.wait.func = memcg_oom_wake_function;
+ owait.wait.private = current;
+ INIT_LIST_HEAD(&owait.wait.task_list);
+
+ /* queue ourselves before marking/locking */
+ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ /* we own the OOM lock and killing is allowed: kill, don't sleep */
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+ current->memcg_oom.order);
+ } else {
+ /* lock not taken or oom_kill_disable set: sleep until woken */
+ schedule();
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
+ mem_cgroup_oom_unlock(memcg);
+ /*
+ * There is no guarantee that an OOM-lock contender
+ * sees the wakeups triggered by the OOM kill
+ * uncharges. Wake any sleepers explicitly.
+ */
+ memcg_oom_recover(memcg);
+ }
+cleanup:
+ /* clear the recorded state and drop a css reference on the memcg */
+ current->memcg_oom.memcg = NULL;
+ css_put(&memcg->css);
+ return true;
+}
+
+/*
+ * Used to update mapped file or writeback or other statistics.
+ *
+ * Notes: Race condition
+ *
+ * We usually use lock_page_cgroup() for accessing page_cgroup member but
+ * it tends to be costly. But considering some conditions, we don't need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to radix-tree. There
+ * are no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
+ * if there are race with "uncharge". Statistics itself is properly handled
+ * by flags.
+ *
+ * Considering "move", this is an only case we see a race. To make the race
+ * small, we check memcg->moving_account and detect there are possibility
+ * of race or not. If there is, we take a lock.
+ */
+
+/*
+ * Begin an update of a page statistic for @page.
+ * @page: page whose statistics are about to be changed
+ * @locked: set to true when the memcg's move_lock was taken
+ * @flags: receives the saved interrupt state when the lock was taken
+ *
+ * *@locked is left untouched on every path that returns without the lock
+ * held. Must be called under rcu_read_lock(), as asserted below.
+ */
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+ bool *locked, unsigned long *flags)
+{
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+again:
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ return;
+ /*
+ * If this memory cgroup is not under account moving, we don't
+ * need to take move_lock_mem_cgroup(). Because we already hold
+ * rcu_read_lock(), any calls to move_account will be delayed until
+ * rcu_read_unlock().
+ */
+ VM_BUG_ON(!rcu_read_lock_held());
+ if (atomic_read(&memcg->moving_account) <= 0)
+ return;
+
+ move_lock_mem_cgroup(memcg, flags);
+ /* the page may have changed memcg before we got the lock: retry */
+ if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+ move_unlock_mem_cgroup(memcg, flags);
+ goto again;
+ }
+ *locked = true;
+}
+
+/*
+ * End an update of a page statistic for @page: release the move_lock of
+ * the page's memcg and restore the interrupt state saved in *@flags.
+ */
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ /*
+ * It's guaranteed that pc->mem_cgroup never changes while
+ * lock is held because a routine modifies pc->mem_cgroup
+ * should take move_lock_mem_cgroup().
+ */
+ move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
+/*
+ * mem_cgroup_update_page_stat - add @val to a per-memcg page statistic
+ * @page: page whose memcg is to be accounted
+ * @idx: index of the statistic in memcg->stat->count[]
+ * @val: signed amount to add
+ *
+ * Does nothing when the memory controller is disabled, or when the page
+ * has no memcg or its page_cgroup is not marked used. Must be called
+ * under rcu_read_lock(), as asserted below.
+ */
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ VM_BUG_ON(!rcu_read_lock_held());
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ return;
+
+ this_cpu_add(memcg->stat->count[idx], val);
+}
+
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_BATCH 32U
+/* Per-cpu cache of charges ("stock") held on behalf of one memcg. */
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages; /* number of pages currently stocked */
+ struct work_struct work; /* runs drain_local_stock() */
+ unsigned long flags; /* bit field, see FLUSHING_CACHED_CHARGE */
+#define FLUSHING_CACHED_CHARGE 0
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_MUTEX(percpu_charge_mutex);
+
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charge is only taken from the stock if @memcg is the memcg cached
+ * in this cpu's stock and the stock holds at least @nr_pages. Requests
+ * larger than CHARGE_BATCH are never served from the stock.
+ *
+ * returns true if successful, false otherwise.
+ */
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ struct memcg_stock_pcp *pcp;
+ bool consumed = false;
+
+ if (nr_pages > CHARGE_BATCH)
+ return false;
+
+ pcp = &get_cpu_var(memcg_stock);
+ if (pcp->cached == memcg && pcp->nr_pages >= nr_pages) {
+ pcp->nr_pages -= nr_pages;
+ consumed = true;
+ }
+ /* on a miss the caller has to call res_counter_charge itself */
+ put_cpu_var(memcg_stock);
+ return consumed;
+}
+
+/*
+ * Returns stocks cached in percpu to res_counter and reset cached information.
+ *
+ * The stocked pages are uncharged from the cached memcg's memory counter
+ * and, with swap accounting, its memory+swap counter. Afterwards the stock
+ * is empty and no longer associated with any memcg.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+ struct mem_cgroup *old = stock->cached;
+
+ if (stock->nr_pages) {
+ unsigned long bytes = stock->nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&old->res, bytes);
+ if (do_swap_account)
+ res_counter_uncharge(&old->memsw, bytes);
+ stock->nr_pages = 0;
+ }
+ stock->cached = NULL;
+}
+
+/*
+ * This must be called under preempt disabled or must be called by
+ * a thread which is pinned to local cpu.
+ *
+ * Drains this cpu's stock and clears its FLUSHING_CACHED_CHARGE bit.
+ * The work_struct argument is not used.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+ struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
+ drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+}
+
+/*
+ * Initialize the work item of every possible cpu's stock so that it runs
+ * drain_local_stock().
+ */
+static void __init memcg_stock_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock =
+ &per_cpu(memcg_stock, cpu);
+ INIT_WORK(&stock->work, drain_local_stock);
+ }
+}
+
+/*
+ * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ *
+ * If this cpu's stock currently belongs to a different memcg it is
+ * drained first and then re-associated with @memcg.
+ */
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+ if (stock->cached != memcg) { /* reset if necessary */
+ drain_stock(stock);
+ stock->cached = memcg;
+ }
+ stock->nr_pages += nr_pages;
+ put_cpu_var(memcg_stock);
+}
+
+/*
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
+ *
+ * The current cpu's stock is drained directly; for other online cpus the
+ * stock's work item is scheduled on that cpu.
+ */
+static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
+{
+ int cpu, curcpu;
+
+ /* Notify other cpus that system-wide "drain" is running */
+ get_online_cpus();
+ curcpu = get_cpu();
+ for_each_online_cpu(cpu) {
+ struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+ struct mem_cgroup *memcg;
+
+ memcg = stock->cached;
+ /* skip empty stocks and stocks outside root_memcg's subtree */
+ if (!memcg || !stock->nr_pages)
+ continue;
+ if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
+ continue;
+ /* only start a drain if one is not already flagged for this cpu */
+ if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (cpu == curcpu)
+ drain_local_stock(&stock->work);
+ else
+ schedule_work_on(cpu, &stock->work);
+ }
+ }
+ put_cpu();
+
+ if (!sync)
+ goto out;
+
+ /* synchronous mode: wait for every drain that is still flagged */
+ for_each_online_cpu(cpu) {
+ struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+ if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+ flush_work(&stock->work);
+ }
+out:
+ put_online_cpus();
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just puts a work per cpu for draining locally on each cpu. Caller can
+ * expect some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(struct mem_cgroup *root_memcg)
+{
+ /*
+ * If someone calls draining, avoid adding more kworker runs.
+ */
+ if (!mutex_trylock(&percpu_charge_mutex))
+ return;
+ drain_all_stock(root_memcg, false);
+ mutex_unlock(&percpu_charge_mutex);
+}
+
+/*
+ * This is a synchronous drain interface: it takes percpu_charge_mutex and
+ * calls drain_all_stock() with sync set.
+ */
+static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
+{
+ /* called when force_empty is called */
+ mutex_lock(&percpu_charge_mutex);
+ drain_all_stock(root_memcg, true);
+ mutex_unlock(&percpu_charge_mutex);
+}
+
+/*
+ * This function drains percpu counter value from DEAD cpu and
+ * move it to local cpu. Note that this function can be preempted.
+ *
+ * Every per-cpu count[] and events[] slot of @cpu is zeroed and its old
+ * value is added to memcg->nocpu_base, under memcg->pcp_counter_lock.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
+{
+ int i;
+
+ spin_lock(&memcg->pcp_counter_lock);
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ long x = per_cpu(memcg->stat->count[i], cpu);
+
+ per_cpu(memcg->stat->count[i], cpu) = 0;
+ memcg->nocpu_base.count[i] += x;
+ }
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long x = per_cpu(memcg->stat->events[i], cpu);
+
+ per_cpu(memcg->stat->events[i], cpu) = 0;
+ memcg->nocpu_base.events[i] += x;
+ }
+ spin_unlock(&memcg->pcp_counter_lock);
+}
+
+static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ struct memcg_stock_pcp *stock;
+ struct mem_cgroup *iter;
+
+ if (action == CPU_ONLINE)
+ return NOTIFY_OK;
+
+ if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ for_each_mem_cgroup(iter)
+ mem_cgroup_drain_pcp_counter(iter, cpu);
+
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
+ return NOTIFY_OK;
+}
+
+
+/* See mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+};
+
+static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ unsigned int nr_pages, unsigned int min_pages,
+ bool invoke_oom)
+{
+ unsigned long csize = nr_pages * PAGE_SIZE;
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&memcg->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ res_counter_uncharge(&memcg->res, csize);
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ /*
+ * Never reclaim on behalf of optional batching, retry with a
+ * single page instead.
+ */
+ if (nr_pages > min_pages)
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ if (gfp_mask & __GFP_NORETRY)
+ return CHARGE_NOMEM;
+
+ ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+ if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
+ return CHARGE_RETRY;
+ /*
+ * Even though the limit is exceeded at this point, reclaim
+ * may have been able to free some pages. Retry the charge
+ * before killing the task.
+ *
+ * Only for regular pages, though: huge pages are rather
+ * unlikely to succeed so close to the limit, and we fall back
+ * to regular pages anyway in case of failure.
+ */
+ if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ if (invoke_oom)
+ mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
+
+ return CHARGE_NOMEM;
+}
+
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
+ */
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+{
+ unsigned int batch = max(CHARGE_BATCH, nr_pages);
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int ret;
+
+ if (mem_cgroup_is_root(memcg))
+ goto done;
+ /*
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current) ||
+ current->flags & PF_EXITING))
+ goto bypass;
+
+ if (unlikely(task_in_memcg_oom(current)))
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
+again:
+ if (consume_stock(memcg, nr_pages))
+ goto done;
+
+ do {
+ bool invoke_oom = oom && !nr_oom_retries;
+
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current))
+ goto bypass;
+
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+ nr_pages, invoke_oom);
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ batch = nr_pages;
+ goto again;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+ if (!oom || invoke_oom)
+ goto nomem;
+ nr_oom_retries--;
+ break;
+ }
+ } while (ret != CHARGE_OK);
+
+ if (batch > nr_pages)
+ refill_stock(memcg, batch - nr_pages);
+done:
+ return 0;
+nomem:
+ if (!(gfp_mask & __GFP_NOFAIL))
+ return -ENOMEM;
+bypass:
+ return -EINTR;
+}
+
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct,
+ * root_mem_cgroup if the charge was bypassed, or NULL if the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup;
+ else if (ret)
+ memcg = NULL;
+
+ return memcg;
+}
+
+/*
+ * Sometimes we have to undo a charge we got by try_charge().
+ * This function is for that and do uncharge, put css's refcnt.
+ * gotten by try_charge().
+ */
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!mem_cgroup_is_root(memcg)) {
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&memcg->res, bytes);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, bytes);
+ }
+}
+
+/*
+ * Cancel charges in this cgroup only; this doesn't propagate to the parent cgroup.
+ * This is useful when moving usage to parent cgroup.
+ */
+static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ if (mem_cgroup_is_root(memcg))
+ return;
+
+ res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
+ if (do_swap_account)
+ res_counter_uncharge_until(&memcg->memsw,
+ memcg->memsw.parent, bytes);
+}
+
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
+static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
+{
+ /* ID 0 is unused ID */
+ if (!id)
+ return NULL;
+ return mem_cgroup_from_id(id);
+}
+
+struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+ unsigned short id;
+ swp_entry_t ent;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ } else if (PageSwapCache(page)) {
+ ent.val = page_private(page);
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
+ }
+ unlock_page_cgroup(pc);
+ return memcg;
+}
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
+ struct page *page,
+ unsigned int nr_pages,
+ enum charge_type ctype,
+ bool lrucare)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ struct zone *uninitialized_var(zone);
+ struct lruvec *lruvec;
+ bool was_on_lru = false;
+ bool anon;
+
+ lock_page_cgroup(pc);
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
+ /*
+ * We don't need page_cgroup_lock for tail pages, because they are not
+ * accessed by any other context at this point.
+ */
+
+ /*
+ * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
+ * may already be on some other mem_cgroup's LRU. Take care of it.
+ */
+ if (lrucare) {
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ was_on_lru = true;
+ }
+ }
+
+ pc->mem_cgroup = memcg;
+ /*
+ * We access a page_cgroup asynchronously without lock_page_cgroup().
+ * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
+ * is accessed after testing USED bit. To make pc->mem_cgroup visible
+ * before USED bit, we need memory barrier here.
+ * See mem_cgroup_add_lru_list(), etc.
+ */
+ smp_wmb();
+ SetPageCgroupUsed(pc);
+
+ if (lrucare) {
+ if (was_on_lru) {
+ lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, page_lru(page));
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
+ anon = true;
+ else
+ anon = false;
+
+ mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
+ unlock_page_cgroup(pc);
+
+ /*
+ * "charge_statistics" updated the event counter, so check it now.
+ * Insert the ancestor (and the ancestor's ancestors) into the softlimit
+ * RB-tree if they exceed the softlimit.
+ */
+ memcg_check_events(memcg, page);
+}
+
+static DEFINE_MUTEX(set_limit_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
+ * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
+ */
+static DEFINE_MUTEX(memcg_slab_mutex);
+
+static DEFINE_MUTEX(activate_kmem_mutex);
+
+static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
+{
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
+ memcg_kmem_is_active(memcg);
+}
+
+/*
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer
+ * in the memcg_cache_params struct.
+ */
+static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+{
+ struct kmem_cache *cachep;
+
+ VM_BUG_ON(p->is_root_cache);
+ cachep = p->root_cache;
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
+}
+
+#ifdef CONFIG_SLABINFO
+static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct memcg_cache_params *params;
+
+ if (!memcg_can_account_kmem(memcg))
+ return -EIO;
+
+ print_slabinfo_header(m);
+
+ mutex_lock(&memcg_slab_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list)
+ cache_show(memcg_params_to_cache(params), m);
+ mutex_unlock(&memcg_slab_mutex);
+
+ return 0;
+}
+#endif
+
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+{
+ struct res_counter *fail_res;
+ int ret = 0;
+
+ ret = res_counter_charge(&memcg->kmem, size, &fail_res);
+ if (ret)
+ return ret;
+
+ ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
+ oom_gfp_allowed(gfp));
+ if (ret == -EINTR) {
+ /*
+ * mem_cgroup_try_charge() chose to bypass to root due to
+ * OOM kill or fatal signal. Since our only options are to
+ * either fail the allocation or charge it to this cgroup, do
+ * it as a temporary condition. But we can't fail. From a
+ * kmem/slab perspective, the cache has already been selected,
+ * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ * our minds.
+ *
+ * This condition will only trigger if the task entered
+ * memcg_charge_kmem in a sane state, but was OOM-killed during
+ * mem_cgroup_try_charge() above. Tasks that were already
+ * dying when the allocation triggers should have been already
+ * directed to the root cgroup in memcontrol.h
+ */
+ res_counter_charge_nofail(&memcg->res, size, &fail_res);
+ if (do_swap_account)
+ res_counter_charge_nofail(&memcg->memsw, size,
+ &fail_res);
+ ret = 0;
+ } else if (ret)
+ res_counter_uncharge(&memcg->kmem, size);
+
+ return ret;
+}
+
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+{
+ res_counter_uncharge(&memcg->res, size);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, size);
+
+ /* Not down to 0 */
+ if (res_counter_uncharge(&memcg->kmem, size))
+ return;
+
+ /*
+ * Releases a reference taken in kmem_cgroup_css_offline in case
+ * this last uncharge is racing with the offlining code or it is
+ * outliving the memcg existence.
+ *
+ * The memory barrier imposed by test&clear is paired with the
+ * explicit one in memcg_kmem_mark_dead().
+ */
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ css_put(&memcg->css);
+}
+
+/*
+ * Helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+ ssize_t size;
+ if (num_groups <= 0)
+ return 0;
+
+ size = 2 * num_groups;
+ if (size < MEMCG_CACHES_MIN_SIZE)
+ size = MEMCG_CACHES_MIN_SIZE;
+ else if (size > MEMCG_CACHES_MAX_SIZE)
+ size = MEMCG_CACHES_MAX_SIZE;
+
+ return size;
+}
+
+/*
+ * We should update the current array size iff all cache updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+ if (num > memcg_limited_groups_array_size)
+ memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+ struct memcg_cache_params *cur_params = s->memcg_params;
+
+ VM_BUG_ON(!is_root_cache(s));
+
+ if (num_groups > memcg_limited_groups_array_size) {
+ int i;
+ struct memcg_cache_params *new_params;
+ ssize_t size = memcg_caches_array_size(num_groups);
+
+ size *= sizeof(void *);
+ size += offsetof(struct memcg_cache_params, memcg_caches);
+
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
+ return -ENOMEM;
+
+ new_params->is_root_cache = true;
+
+ /*
+ * There is the chance it will be bigger than
+ * memcg_limited_groups_array_size, if we failed an allocation
+ * in a cache, in which case all caches updated before it, will
+ * have a bigger array.
+ *
+ * But if that is the case, the data after
+ * memcg_limited_groups_array_size is certainly unused
+ */
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ if (!cur_params->memcg_caches[i])
+ continue;
+ new_params->memcg_caches[i] =
+ cur_params->memcg_caches[i];
+ }
+
+ /*
+ * Ideally, we would wait until all caches succeed, and only
+ * then free the old one. But this is not worth the extra
+ * pointer per-cache we'd have to have for this.
+ *
+ * It is not a big deal if some caches are left with a size
+ * bigger than the others. And all updates will reset this
+ * anyway.
+ */
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
+ }
+ return 0;
+}
+
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
+{
+ size_t size;
+
+ if (!memcg_kmem_enabled())
+ return 0;
+
+ if (!memcg) {
+ size = offsetof(struct memcg_cache_params, memcg_caches);
+ size += memcg_limited_groups_array_size * sizeof(void *);
+ } else
+ size = sizeof(struct memcg_cache_params);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params)
+ return -ENOMEM;
+
+ if (memcg) {
+ s->memcg_params->memcg = memcg;
+ s->memcg_params->root_cache = root_cache;
+ css_get(&memcg->css);
+ } else
+ s->memcg_params->is_root_cache = true;
+
+ return 0;
+}
+
+void memcg_free_cache_params(struct kmem_cache *s)
+{
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ css_put(&s->memcg_params->memcg->css);
+ kfree(s->memcg_params);
+}
+
+static void memcg_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+ static char memcg_name_buf[NAME_MAX + 1]; /* protected by
+ memcg_slab_mutex */
+ struct kmem_cache *cachep;
+ int id;
+
+ lockdep_assert_held(&memcg_slab_mutex);
+
+ id = memcg_cache_id(memcg);
+
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
+ */
+ if (cache_from_memcg_idx(root_cache, id))
+ return;
+
+ cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
+ cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
+ /*
+ * If we could not create a memcg cache, do not complain, because
+ * that's not critical at all as we can always proceed with the root
+ * cache.
+ */
+ if (!cachep)
+ return;
+
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+
+ BUG_ON(root_cache->memcg_params->memcg_caches[id]);
+ root_cache->memcg_params->memcg_caches[id] = cachep;
+}
+
+static void memcg_unregister_cache(struct kmem_cache *cachep)
+{
+ struct kmem_cache *root_cache;
+ struct mem_cgroup *memcg;
+ int id;
+
+ lockdep_assert_held(&memcg_slab_mutex);
+
+ BUG_ON(is_root_cache(cachep));
+
+ root_cache = cachep->memcg_params->root_cache;
+ memcg = cachep->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
+
+ BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ root_cache->memcg_params->memcg_caches[id] = NULL;
+
+ list_del(&cachep->memcg_params->list);
+
+ kmem_cache_destroy(cachep);
+}
+
+/*
+ * During the creation of a new cache, we need to disable our accounting
+ * mechanism altogether. This is true even if we are not creating, but rather
+ * just enqueueing new caches to be created.
+ *
+ * This is because that process will trigger allocations; some visible, like
+ * explicit kmallocs to auxiliary data structures, name strings and internal
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
+ * objects during debug.
+ *
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
+ * to it. This may not be a bounded recursion: since the first cache creation
+ * failed to complete (waiting on the allocation), we'll just try to create the
+ * cache again, failing at the same point.
+ *
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
+ * inside the following two functions.
+ */
+static inline void memcg_stop_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account--;
+}
+
+int __memcg_cleanup_cache_params(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+ int i, failed = 0;
+
+ mutex_lock(&memcg_slab_mutex);
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
+ if (!c)
+ continue;
+
+ memcg_unregister_cache(c);
+
+ if (cache_from_memcg_idx(s, i))
+ failed++;
+ }
+ mutex_unlock(&memcg_slab_mutex);
+ return failed;
+}
+
+static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params, *tmp;
+
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ mutex_lock(&memcg_slab_mutex);
+ list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
+ cachep = memcg_params_to_cache(params);
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ memcg_unregister_cache(cachep);
+ }
+ mutex_unlock(&memcg_slab_mutex);
+}
+
+struct memcg_register_cache_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
+static void memcg_register_cache_func(struct work_struct *w)
+{
+ struct memcg_register_cache_work *cw =
+ container_of(w, struct memcg_register_cache_work, work);
+ struct mem_cgroup *memcg = cw->memcg;
+ struct kmem_cache *cachep = cw->cachep;
+
+ mutex_lock(&memcg_slab_mutex);
+ memcg_register_cache(memcg, cachep);
+ mutex_unlock(&memcg_slab_mutex);
+
+ css_put(&memcg->css);
+ kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ */
+static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct memcg_register_cache_work *cw;
+
+ cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+ if (cw == NULL) {
+ css_put(&memcg->css);
+ return;
+ }
+
+ cw->memcg = memcg;
+ cw->cachep = cachep;
+
+ INIT_WORK(&cw->work, memcg_register_cache_func);
+ schedule_work(&cw->work);
+}
+
+static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ /*
+ * We need to stop accounting when we kmalloc, because if the
+ * corresponding kmalloc cache is not yet created, the first allocation
+ * in __memcg_schedule_register_cache will recurse.
+ *
+ * However, it is better to enclose the whole function. Depending on
+ * the debugging options enabled, INIT_WORK(), for instance, can
+ * trigger an allocation. This too, will make us recurse. Because at
+ * this point we can't allow ourselves back into memcg_kmem_get_cache,
+ * the safest choice is to do it like this, wrapping the whole function.
+ */
+ memcg_stop_kmem_account();
+ __memcg_schedule_register_cache(memcg, cachep);
+ memcg_resume_kmem_account();
+}
+
+int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
+{
+ int res;
+
+ res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
+ PAGE_SIZE << order);
+ if (!res)
+ atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+ return res;
+}
+
+void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
+{
+ memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
+ atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+}
+
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet, if we are the first user of it,
+ * we either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+ gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ struct kmem_cache *memcg_cachep;
+
+ VM_BUG_ON(!cachep->memcg_params);
+ VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return cachep;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+
+ if (!memcg_can_account_kmem(memcg))
+ goto out;
+
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
+ goto out;
+ }
+
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget_online(&memcg->css))
+ goto out;
+ rcu_read_unlock();
+
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+ * context), we could be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+ * However, there are some clashes that can arise from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * memcg_create_kmem_cache, this means no further allocation
+ * could happen with the slab_mutex held. So it's better to
+ * defer everything.
+ */
+ memcg_schedule_register_cache(memcg, cachep);
+ return cachep;
+out:
+ rcu_read_unlock();
+ return cachep;
+}
+
+/*
+ * We need to verify if the allocation against current->mm->owner's memcg is
+ * possible for the given order. But the page is not allocated yet, so we'll
+ * need a further commit step to do the final arrangements.
+ *
+ * It is possible for the task to switch cgroups in the meantime, so at
+ * commit time, we can't rely on task conversion any longer. We'll then use
+ * the handle argument to return to the caller which cgroup we should commit
+ * against. We could also return the memcg directly and avoid the pointer
+ * passing, but a boolean return value gives better semantics considering
+ * the compiled-out case as well.
+ *
+ * Returning true means the allocation is possible.
+ */
+bool
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ *_memcg = NULL;
+
+ /*
+ * Disabling accounting is only relevant for some specific memcg
+ * internal allocations. Therefore we would initially not have such
+ * check here, since direct calls to the page allocator that are
+ * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+ * outside memcg core. We are mostly concerned with cache allocations,
+ * and by having this test at memcg_kmem_get_cache, we are already able
+ * to relay the allocation to the root cache and bypass the memcg cache
+ * altogether.
+ *
+ * There is one exception, though: the SLUB allocator does not create
+ * large order caches, but rather services large kmallocs directly from
+ * the page allocator. Therefore, the following sequence when backed by
+ * the SLUB allocator:
+ *
+ * memcg_stop_kmem_account();
+ * kmalloc(<large_number>)
+ * memcg_resume_kmem_account();
+ *
+ * would effectively ignore the fact that we should skip accounting,
+ * since it will drive us directly to this function without passing
+ * through the cache selector memcg_kmem_get_cache. Such large
+ * allocations are extremely rare but can happen, for instance, for the
+ * cache arrays. We bring this test here.
+ */
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return true;
+
+ memcg = get_mem_cgroup_from_mm(current->mm);
+
+ if (!memcg_can_account_kmem(memcg)) {
+ css_put(&memcg->css);
+ return true;
+ }
+
+ ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+ if (!ret)
+ *_memcg = memcg;
+
+ css_put(&memcg->css);
+ return (ret == 0);
+}
+
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ int order)
+{
+ struct page_cgroup *pc;
+
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
+
+ /* The page allocation failed. Revert */
+ if (!page) {
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+ return;
+ }
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ pc->mem_cgroup = memcg;
+ SetPageCgroupUsed(pc);
+ unlock_page_cgroup(pc);
+}
+
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Fast unlocked return. Theoretically might have changed, have to
+ * check again after locking.
+ */
+ if (!PageCgroupUsed(pc))
+ return;
+
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ ClearPageCgroupUsed(pc);
+ }
+ unlock_page_cgroup(pc);
+
+ /*
+ * We trust that only if there is a memcg associated with the page, it
+ * is a valid allocation
+ */
+ if (!memcg)
+ return;
+
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+}
+#else
+static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compound_lock.
+ * charge/uncharge will never happen and move_account() is done under
+ * compound_lock(), so we don't have to take care of races.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head)
+{
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
+ struct page_cgroup *pc;
+ struct mem_cgroup *memcg;
+ int i;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = head_pc->mem_cgroup;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ pc = head_pc + i;
+ pc->mem_cgroup = memcg;
+ smp_wmb();/* see __commit_charge() */
+ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ }
+ __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ HPAGE_PMD_NR);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @pc: page_cgroup of the page.
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct page_cgroup *pc,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+ bool anon = PageAnon(page);
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ lock_page_cgroup(pc);
+
+ ret = -EINVAL;
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
+ goto unlock;
+
+ move_lock_mem_cgroup(from, &flags);
+
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+
+ /* caller should have done css_get */
+ pc->mem_cgroup = to;
+ mem_cgroup_charge_statistics(to, page, anon, nr_pages);
+ move_unlock_mem_cgroup(from, &flags);
+ ret = 0;
+unlock:
+ unlock_page_cgroup(pc);
+ /*
+ * check events
+ */
+ memcg_check_events(to, page);
+ memcg_check_events(from, page);
+out:
+ return ret;
+}
+
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finally mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
+ */
+static int mem_cgroup_move_parent(struct page *page,
+ struct page_cgroup *pc,
+ struct mem_cgroup *child)
+{
+ struct mem_cgroup *parent;
+ unsigned int nr_pages;
+ unsigned long uninitialized_var(flags);
+ int ret;
+
+ VM_BUG_ON(mem_cgroup_is_root(child));
+
+ ret = -EBUSY;
+ if (!get_page_unless_zero(page))
+ goto out;
+ if (isolate_lru_page(page))
+ goto put;
+
+ nr_pages = hpage_nr_pages(page);
+
+ parent = parent_mem_cgroup(child);
+ /*
+ * If no parent, move charges to root cgroup.
+ */
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ if (nr_pages > 1) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ flags = compound_lock_irqsave(page);
+ }
+
+ ret = mem_cgroup_move_account(page, nr_pages,
+ pc, child, parent);
+ if (!ret)
+ __mem_cgroup_cancel_local_charge(child, nr_pages);
+
+ if (nr_pages > 1)
+ compound_unlock_irqrestore(page, flags);
+ putback_lru_page(page);
+put:
+ put_page(page);
+out:
+ return ret;
+}
+
+int mem_cgroup_charge_anon(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
+{
+ unsigned int nr_pages = 1;
+ struct mem_cgroup *memcg;
+ bool oom = true;
+
+ if (mem_cgroup_disabled()) /* controller disabled: nothing to charge, report success */
+ return 0;
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ VM_BUG_ON(!mm);
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /*
+ * A huge page is charged as 1 << compound_order(page)
+ * pages (see nr_pages above).
+ * Never OOM-kill a process for a huge page. The
+ * fault handler will fall back to regular pages.
+ */
+ oom = false;
+ }
+
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+ if (!memcg) /* no memcg handed back by the try-charge step: fail with -ENOMEM */
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages,
+ MEM_CGROUP_CHARGE_TYPE_ANON, false);
+ return 0;
+}
+
+/*
+ * During swap-in the sequence is try_charge -> commit or cancel, with the
+ * page locked throughout.
+ * When try_charge() successfully returns, one refcnt to memcg without
+ * struct page_cgroup is acquired. This refcnt will be consumed by
+ * "commit()" or removed by "cancel()".
+ *
+ * Returns 0 with *memcgp set: NULL when the page is already charged,
+ * root_mem_cgroup when mem_cgroup_try_charge() returned -EINTR, otherwise
+ * the memcg that was charged. Any other error from
+ * mem_cgroup_try_charge() is returned as is and *memcgp is not written.
+ */
+static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+ struct page *page,
+ gfp_t mask,
+ struct mem_cgroup **memcgp)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+ int ret;
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Every swap fault against a single page tries to charge the
+ * page, bail as early as possible. shmem_unuse() encounters
+ * already charged pages, too. The USED bit is protected by
+ * the page lock, which serializes swap cache removal, which
+ * in turn serializes uncharging.
+ */
+ if (PageCgroupUsed(pc))
+ goto out; /* memcg is still NULL here */
+ if (do_swap_account)
+ memcg = try_get_mem_cgroup_from_page(page);
+ if (!memcg) /* no memcg found via the page: fall back to the mm's memcg */
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, mask, 1, true);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup; /* -EINTR is not an error for the caller: hand back root */
+ else if (ret)
+ return ret;
+out:
+ *memcgp = memcg;
+ return 0;
+}
+
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+ if (mem_cgroup_disabled()) {
+ *memcgp = NULL; /* commit and cancel both return early on a NULL memcg */
+ return 0;
+ }
+ /*
+ * A racing thread's fault, or swapoff, may have already
+ * updated the pte, and even removed page from swap cache: in
+ * those cases unuse_pte()'s pte_same() test will fail; but
+ * there's also a KSM case which does need to charge the page.
+ * So a page that is no longer in swap cache is charged to the
+ * memcg of @mm as a single page.
+ */
+ if (!PageSwapCache(page)) {
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ *memcgp = memcg;
+ return 0;
+ }
+ return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
+}
+
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled()) /* controller disabled: nothing to cancel */
+ return;
+ if (!memcg) /* the try step handed back no memcg, so nothing to cancel */
+ return;
+ __mem_cgroup_cancel_charge(memcg, 1); /* swap-in charges exactly one page */
+}
+
+static void
+__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
+ enum charge_type ctype)
+{
+ if (mem_cgroup_disabled())
+ return;
+ if (!memcg) /* nothing to commit when the try step handed back no memcg */
+ return;
+
+ __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
+ /*
+ * Now the swapped-out data is in memory. This means this page may
+ * be counted both as mem and swap, i.e. double counted.
+ * Fix it by uncharging from memsw. Basically, this SwapCache is stable
+ * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
+ * may call delete_from_swap_cache() before we reach here, hence the
+ * PageSwapCache() recheck below.
+ */
+ if (do_swap_account && PageSwapCache(page)) {
+ swp_entry_t ent = {.val = page_private(page)};
+ mem_cgroup_uncharge_swap(ent);
+ }
+}
+
+void mem_cgroup_commit_charge_swapin(struct page *page,
+ struct mem_cgroup *memcg)
+{
+ __mem_cgroup_commit_charge_swapin(page, memcg,
+ MEM_CGROUP_CHARGE_TYPE_ANON); /* this entry point always commits as anon */
+}
+
+int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+ enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ struct mem_cgroup *memcg;
+ int ret;
+
+ if (mem_cgroup_disabled())
+ return 0;
+ if (PageCompound(page)) /* compound pages are not charged by this function */
+ return 0;
+
+ if (PageSwapCache(page)) { /* shmem: go through the swap-in try/commit pair */
+ ret = __mem_cgroup_try_charge_swapin(mm, page,
+ gfp_mask, &memcg);
+ if (ret)
+ return ret;
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ return 0;
+ }
+
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+ return 0;
+}
+
+static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ const enum charge_type ctype)
+{
+ struct memcg_batch_info *batch = NULL;
+ bool uncharge_memsw = true;
+
+ /* If swapout, usage of swap doesn't decrease */
+ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+ uncharge_memsw = false;
+
+ batch = &current->memcg_batch;
+ /*
+ * Usually we do css_get() when we remember a memcg pointer.
+ * But in this case, we keep res->usage until the end of a series of
+ * uncharges. Then, it's ok to ignore memcg's refcnt.
+ */
+ if (!batch->memcg)
+ batch->memcg = memcg;
+ /*
+ * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+ * In those cases, all pages freed continuously can be expected to be in
+ * the same cgroup and we have chance to coalesce uncharges.
+ * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+ * because we want to do uncharge as soon as possible.
+ */
+
+ if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+ goto direct_uncharge;
+
+ if (nr_pages > 1) /* multi-page uncharges are never batched */
+ goto direct_uncharge;
+
+ /*
+ * In the typical case, batch->memcg == memcg. This means we can
+ * merge a series of uncharges into one uncharge of res_counter.
+ * If not, we uncharge res_counter one by one.
+ */
+ if (batch->memcg != memcg)
+ goto direct_uncharge;
+ /* remember freed charge and uncharge it later */
+ batch->nr_pages++;
+ if (uncharge_memsw)
+ batch->memsw_nr_pages++;
+ return;
+direct_uncharge:
+ res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
+ if (uncharge_memsw)
+ res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
+ if (unlikely(batch->memcg != memcg))
+ memcg_oom_recover(memcg);
+}
+
+/*
+ * uncharge if !page_mapped(page)
+ *
+ * Returns the memcg the page was charged to when the page_cgroup was
+ * actually marked unused, or NULL when nothing was uncharged (controller
+ * disabled, page not charged, or one of the per-ctype checks below
+ * bailed out).
+ */
+static struct mem_cgroup *
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
+ bool end_migration)
+{
+ struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
+ bool anon;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ }
+ /*
+ * Check if our page_cgroup is valid. This first test is done
+ * without the page_cgroup lock and is repeated under it below.
+ */
+ pc = lookup_page_cgroup(page);
+ if (unlikely(!PageCgroupUsed(pc)))
+ return NULL;
+
+ lock_page_cgroup(pc);
+
+ memcg = pc->mem_cgroup;
+
+ if (!PageCgroupUsed(pc))
+ goto unlock_out;
+
+ anon = PageAnon(page);
+
+ switch (ctype) {
+ case MEM_CGROUP_CHARGE_TYPE_ANON:
+ /*
+ * Generally PageAnon tells if it's the anon statistics to be
+ * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+ * used before page reached the stage of being marked PageAnon.
+ */
+ anon = true;
+ /* fallthrough */
+ case MEM_CGROUP_CHARGE_TYPE_DROP:
+ /* See mem_cgroup_prepare_migration() */
+ if (page_mapped(page))
+ goto unlock_out;
+ /*
+ * Pages under migration may not be uncharged. But
+ * end_migration() /must/ be the one uncharging the
+ * unused post-migration page and so it has to call
+ * here with the migration bit still set. See the
+ * res_counter handling below.
+ */
+ if (!end_migration && PageCgroupMigration(pc))
+ goto unlock_out;
+ break;
+ case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
+ if (!PageAnon(page)) { /* Shared memory */
+ if (page->mapping && !page_is_file_cache(page))
+ goto unlock_out;
+ } else if (page_mapped(page)) /* Anon */
+ goto unlock_out;
+ break;
+ default:
+ break;
+ }
+
+ mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
+
+ ClearPageCgroupUsed(pc);
+ /*
+ * pc->mem_cgroup is not cleared here. It will be accessed when it's
+ * freed from LRU. This is safe because uncharged page is expected not
+ * to be reused (freed soon). Exception is SwapCache, it's handled by
+ * special functions.
+ */
+
+ unlock_page_cgroup(pc);
+ /*
+ * even after unlock, we have memcg->res.usage here and this memcg
+ * will never be freed, so it's safe to call css_get().
+ */
+ memcg_check_events(memcg, page);
+ if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+ mem_cgroup_swap_statistics(memcg, true);
+ css_get(&memcg->css);
+ }
+ /*
+ * Migration does not charge the res_counter for the
+ * replacement page, so leave it alone when phasing out the
+ * page that is unused after the migration.
+ */
+ if (!end_migration && !mem_cgroup_is_root(memcg))
+ mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
+
+ return memcg;
+
+unlock_out:
+ unlock_page_cgroup(pc);
+ return NULL;
+}
+
+void mem_cgroup_uncharge_page(struct page *page)
+{
+ /* early check: a page that is still mapped is not uncharged. */
+ if (page_mapped(page))
+ return;
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ /*
+ * If the page is in swap cache, uncharge should be deferred
+ * to the swap path, which also properly accounts swap usage
+ * and handles memcg lifetime.
+ *
+ * Note that this check is not stable and reclaim may add the
+ * page to swap cache at any time after this. However, if the
+ * page is not in swap cache by the time page->mapcount hits
+ * 0, there won't be any page table references to the swap
+ * slot, and reclaim will free it and not actually write the
+ * page to disk.
+ */
+ if (PageSwapCache(page))
+ return;
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
+}
+
+void mem_cgroup_uncharge_cache_page(struct page *page)
+{
+ VM_BUG_ON_PAGE(page_mapped(page), page); /* callers must pass an unmapped page ... */
+ VM_BUG_ON_PAGE(page->mapping, page); /* ... whose ->mapping is already cleared */
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
+}
+
+/*
+ * Batch start/end is called in unmap_page_range/invalidate/truncate.
+ * In those cases, pages are freed continuously and we can expect the
+ * pages to be in the same memcg. All these callers limit the number of
+ * pages freed at once, so uncharge_start/end() is called properly.
+ * This may be called multiple (nested) times in a context; do_batch
+ * counts the nesting depth.
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+ current->memcg_batch.do_batch++;
+ /* Nesting is allowed: only the outermost start resets the batch. */
+ if (current->memcg_batch.do_batch == 1) {
+ current->memcg_batch.memcg = NULL;
+ current->memcg_batch.nr_pages = 0;
+ current->memcg_batch.memsw_nr_pages = 0;
+ }
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+ struct memcg_batch_info *batch = &current->memcg_batch;
+
+ if (!batch->do_batch) /* no batch is open: nothing to flush */
+ return;
+
+ batch->do_batch--;
+ if (batch->do_batch) /* If stacked, do nothing. */
+ return;
+
+ if (!batch->memcg) /* nothing was uncharged while the batch was open */
+ return;
+ /*
+ * This "batch->memcg" is valid without any css_get/put etc...
+ * because we hide charges behind us.
+ */
+ if (batch->nr_pages)
+ res_counter_uncharge(&batch->memcg->res,
+ batch->nr_pages * PAGE_SIZE);
+ if (batch->memsw_nr_pages)
+ res_counter_uncharge(&batch->memcg->memsw,
+ batch->memsw_nr_pages * PAGE_SIZE);
+ memcg_oom_recover(batch->memcg);
+ /* forget this pointer (for sanity check) */
+ batch->memcg = NULL;
+}
+
+#ifdef CONFIG_SWAP
+/*
+ * Called after __delete_from_swap_cache() to drop the "page" account.
+ * When @swapout is true, the memcg information is recorded in the
+ * swap_cgroup of "ent"; otherwise the charge is simply dropped.
+ */
+void
+mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
+{
+ struct mem_cgroup *memcg;
+ int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
+
+ if (!swapout) /* this was a swap cache but the swap is unused ! */
+ ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
+
+ memcg = __mem_cgroup_uncharge_common(page, ctype, false);
+
+ /*
+ * record memcg information, if swapout && memcg != NULL,
+ * css_get() was called in uncharge().
+ */
+ if (do_swap_account && swapout && memcg)
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
+}
+#endif
+
+#ifdef CONFIG_MEMCG_SWAP
+/*
+ * Called from swap_entry_free(). Removes the record in swap_cgroup and
+ * uncharges the "memsw" account.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t ent)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(ent, 0); /* clear the record; the value returned is used as the owner's id */
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ /*
+ * We uncharge this because swap is freed. This memcg can
+ * be an obsolete one. We avoid calling css_tryget_online().
+ * The root memcg's memsw counter is left untouched.
+ */
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css); /* presumably pairs with the css_get() taken at swapout - confirm */
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from: mem_cgroup which the entry is moved from
+ * @to: mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called res_counter_charge() for
+ * both res and memsw, and called css_get().
+ */
+static int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ unsigned short old_id, new_id;
+
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
+
+ if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
+ mem_cgroup_swap_statistics(from, false);
+ mem_cgroup_swap_statistics(to, true);
+ /*
+ * This function is only called from task migration context now.
+ * It postpones res_counter and refcount handling till the end
+ * of task migration(mem_cgroup_clear_mc()) for performance
+ * improvement. But we cannot postpone css_get(to) because if
+ * the process that has been moved to @to does swap-in, the
+ * refcount of @to might be decreased to 0.
+ *
+ * We are in attach() phase, so the cgroup is guaranteed to be
+ * alive, so we can just call css_get().
+ */
+ css_get(&to->css);
+ return 0;
+ }
+ return -EINVAL; /* the record did not belong to @from */
+}
+#else
+static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ return -EINVAL; /* stub for builds without CONFIG_MEMCG_SWAP: always fails */
+}
+#endif
+
+/*
+ * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
+ * page belongs to.
+ *
+ * *memcgp is set to that memcg (with a css reference held), or to NULL
+ * when the controller is disabled or the old page is not charged.
+ */
+void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
+ struct mem_cgroup **memcgp)
+{
+ struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
+ enum charge_type ctype;
+
+ *memcgp = NULL;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ css_get(&memcg->css);
+ /*
+ * At migrating an anonymous page, its mapcount goes down
+ * to 0 and uncharge() will be called. But, even if it's fully
+ * unmapped, migration may fail and this page has to be
+ * charged again. We set MIGRATION flag here and delay uncharge
+ * until end_migration() is called
+ *
+ * Corner Case Thinking
+ * A)
+ * When the old page was mapped as Anon and it's unmap-and-freed
+ * while migration was ongoing.
+ * If unmap finds the old page, uncharge() of it will be delayed
+ * until end_migration(). If unmap finds a new page, it's
+ * uncharged when it make mapcount to be 1->0. If unmap code
+ * finds swap_migration_entry, the new page will not be mapped
+ * and end_migration() will find it(mapcount==0).
+ *
+ * B)
+ * When the old page was mapped but migration fails, the kernel
+ * remaps it. A charge for it is kept by MIGRATION flag even
+ * if mapcount goes down to 0. We can do remap successfully
+ * without charging it again.
+ *
+ * C)
+ * The "old" page is under lock_page() until the end of
+ * migration, so, the old page itself will not be swapped-out.
+ * If the new page is swapped out before end_migration, our
+ * hook to usual swap-out path will catch the event.
+ */
+ if (PageAnon(page))
+ SetPageCgroupMigration(pc);
+ }
+ unlock_page_cgroup(pc);
+ /*
+ * If the page is not charged at this point,
+ * we return here.
+ */
+ if (!memcg)
+ return;
+
+ *memcgp = memcg;
+ /*
+ * We charge new page before it's used/mapped. So, even if unlock_page()
+ * is called before end_migration, we can catch all events on this new
+ * page. In the case new page is migrated but not remapped, new page's
+ * mapcount will be finally 0 and we call uncharge in end_migration().
+ */
+ if (PageAnon(page))
+ ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
+ else
+ ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ /*
+ * The page is committed to the memcg, but it's not actually
+ * charged to the res_counter since we plan on replacing the
+ * old one and only one page is going to be left afterwards.
+ */
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
+}
+
+/* remove redundant charge if migration failed */
+void mem_cgroup_end_migration(struct mem_cgroup *memcg,
+ struct page *oldpage, struct page *newpage, bool migration_ok)
+{
+ struct page *used, *unused;
+ struct page_cgroup *pc;
+ bool anon;
+
+ if (!memcg) /* no memcg was handed in: nothing to finish */
+ return;
+
+ if (!migration_ok) {
+ used = oldpage;
+ unused = newpage;
+ } else {
+ used = newpage;
+ unused = oldpage;
+ }
+ anon = PageAnon(used);
+ __mem_cgroup_uncharge_common(unused,
+ anon ? MEM_CGROUP_CHARGE_TYPE_ANON
+ : MEM_CGROUP_CHARGE_TYPE_CACHE,
+ true);
+ css_put(&memcg->css);
+ /*
+ * We disallowed uncharge of pages under migration because mapcount
+ * of the page goes down to zero, temporarily.
+ * Clear the flag and check the page should be charged.
+ */
+ pc = lookup_page_cgroup(oldpage);
+ lock_page_cgroup(pc);
+ ClearPageCgroupMigration(pc);
+ unlock_page_cgroup(pc);
+
+ /*
+ * If a page is a file cache, radix-tree replacement is very atomic
+ * and we can skip this check. When it was an Anon page, its mapcount
+ * goes down to 0. But because we added MIGRATION flag, it's not
+ * uncharged yet. There are several cases but the page->mapcount check
+ * and USED bit check in mem_cgroup_uncharge_page() will do enough
+ * check. (see mem_cgroup_prepare_migration() also)
+ */
+ if (anon)
+ mem_cgroup_uncharge_page(used);
+}
+
+/*
+ * When replacing a page cache page, newpage is not under any memcg but
+ * it's on LRU. So, this function doesn't touch res_counter but handles LRU
+ * in correct way. Both pages are locked so we cannot race with uncharge.
+ */
+void mem_cgroup_replace_page_cache(struct page *oldpage,
+ struct page *newpage)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+ enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(oldpage);
+ /* fix accounting on old pages */
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
+ ClearPageCgroupUsed(pc);
+ }
+ unlock_page_cgroup(pc);
+
+ /*
+ * When called from shmem_replace_page(), in some cases the
+ * oldpage has already been charged, and in some cases not.
+ * If it was not charged there is nothing to hand over to newpage.
+ */
+ if (!memcg)
+ return;
+ /*
+ * Even if newpage->mapping was NULL before starting replacement,
+ * the newpage may be on LRU(or pagevec for LRU) already. We lock
+ * LRU while we overwrite pc->mem_cgroup.
+ */
+ __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
+}
+
+#ifdef CONFIG_DEBUG_VM
+static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Can be NULL while feeding pages into the page allocator for
+ * the first time, i.e. during boot or memory hotplug;
+ * or when mem_cgroup_disabled().
+ */
+ if (likely(pc) && PageCgroupUsed(pc))
+ return pc;
+ return NULL; /* no page_cgroup, or the page is not charged */
+}
+
+bool mem_cgroup_bad_page_check(struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return false;
+
+ return lookup_page_cgroup_used(page) != NULL; /* true when the page is still marked as charged */
+}
+
+void mem_cgroup_print_bad_page(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup_used(page);
+ if (pc) { /* only pages still marked as charged are reported */
+ pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+ pc, pc->flags, pc->mem_cgroup);
+ }
+}
+#endif
+
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int retry_count;
+ u64 memswlimit, memlimit;
+ int ret = 0;
+ int children = mem_cgroup_count_children(memcg);
+ u64 curusage, oldusage;
+ int enlarge;
+
+ /*
+ * To keep hierarchical reclaim simple, how long we should retry
+ * depends on the caller. We set our retry count to be a function
+ * of the number of children which we should visit in this loop.
+ */
+ retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
+
+ oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+
+ enlarge = 0;
+ while (retry_count) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ /*
+ * Rather than hide all in some function, I do this in
+ * open coded manner. You see what this really does.
+ * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
+ */
+ mutex_lock(&set_limit_mutex);
+ memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ if (memswlimit < val) {
+ ret = -EINVAL;
+ mutex_unlock(&set_limit_mutex);
+ break;
+ }
+
+ memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (memlimit < val)
+ enlarge = 1;
+
+ ret = res_counter_set_limit(&memcg->res, val);
+ if (!ret) {
+ if (memswlimit == val)
+ memcg->memsw_is_minimum = true;
+ else
+ memcg->memsw_is_minimum = false;
+ }
+ mutex_unlock(&set_limit_mutex);
+
+ if (!ret)
+ break;
+
+ mem_cgroup_reclaim(memcg, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_SHRINK);
+ curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ /* Usage is reduced ? A retry is consumed only when it is not. */
+ if (curusage >= oldusage)
+ retry_count--;
+ else
+ oldusage = curusage;
+ }
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
+
+ return ret;
+}
+
+static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int retry_count;
+ u64 memlimit, memswlimit, oldusage, curusage;
+ int children = mem_cgroup_count_children(memcg);
+ int ret = -EBUSY;
+ int enlarge = 0;
+
+ /* see mem_cgroup_resize_limit */
+ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+ oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ while (retry_count) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ /*
+ * Rather than hide all in some function, I do this in
+ * open coded manner. You see what this really does.
+ * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
+ */
+ mutex_lock(&set_limit_mutex);
+ memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (memlimit > val) {
+ ret = -EINVAL;
+ mutex_unlock(&set_limit_mutex);
+ break;
+ }
+ memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ if (memswlimit < val)
+ enlarge = 1;
+ ret = res_counter_set_limit(&memcg->memsw, val);
+ if (!ret) {
+ if (memlimit == val)
+ memcg->memsw_is_minimum = true;
+ else
+ memcg->memsw_is_minimum = false;
+ }
+ mutex_unlock(&set_limit_mutex);
+
+ if (!ret)
+ break;
+
+ mem_cgroup_reclaim(memcg, GFP_KERNEL,
+ MEM_CGROUP_RECLAIM_NOSWAP |
+ MEM_CGROUP_RECLAIM_SHRINK);
+ curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ /* Usage is reduced ? A retry is consumed only when it is not. */
+ if (curusage >= oldusage)
+ retry_count--;
+ else
+ oldusage = curusage;
+ }
+ if (!ret && enlarge)
+ memcg_oom_recover(memcg);
+ return ret;
+}
+
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+ gfp_t gfp_mask,
+ unsigned long *total_scanned)
+{
+ unsigned long nr_reclaimed = 0;
+ struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+ unsigned long reclaimed;
+ int loop = 0;
+ struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
+ unsigned long nr_scanned;
+
+ if (order > 0) /* only order-0 requests trigger soft limit reclaim */
+ return 0;
+
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+ /*
+ * This loop can run a while, especially if mem_cgroups continuously
+ * keep exceeding their soft limit and putting the system under
+ * pressure
+ */
+ do {
+ if (next_mz)
+ mz = next_mz;
+ else
+ mz = mem_cgroup_largest_soft_limit_node(mctz);
+ if (!mz)
+ break;
+
+ nr_scanned = 0;
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+ gfp_mask, &nr_scanned);
+ nr_reclaimed += reclaimed;
+ *total_scanned += nr_scanned;
+ spin_lock(&mctz->lock);
+
+ /*
+ * If we failed to reclaim anything from this memory cgroup
+ * it is time to move on to the next cgroup
+ */
+ next_mz = NULL;
+ if (!reclaimed) {
+ do {
+ /*
+ * Loop until we find yet another one.
+ *
+ * By the time we get the soft_limit lock
+ * again, someone might have added the
+ * group back on the RB tree. Iterate to
+ * make sure we get a different mem.
+ * mem_cgroup_largest_soft_limit_node returns
+ * NULL if no other cgroup is present on
+ * the tree
+ */
+ next_mz =
+ __mem_cgroup_largest_soft_limit_node(mctz);
+ if (next_mz == mz)
+ css_put(&next_mz->memcg->css);
+ else /* next_mz == NULL or other memcg */
+ break;
+ } while (1);
+ }
+ __mem_cgroup_remove_exceeded(mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->memcg->res);
+ /*
+ * One school of thought says that we should not add
+ * back the node to the tree if reclaim returns 0.
+ * But our reclaim could return 0, simply because due
+ * to priority we are exposing a smaller subset of
+ * memory to reclaim from. Consider this as a longer
+ * term TODO.
+ */
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
+ spin_unlock(&mctz->lock);
+ css_put(&mz->memcg->css);
+ loop++;
+ /*
+ * Could not reclaim anything and there are no more
+ * mem cgroups to try or we seem to be looping without
+ * reclaiming anything.
+ */
+ if (!nr_reclaimed &&
+ (next_mz == NULL ||
+ loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+ break;
+ } while (!nr_reclaimed);
+ if (next_mz) /* drop the reference held on a candidate that was never visited */
+ css_put(&next_mz->memcg->css);
+ return nr_reclaimed;
+}
+
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru list to clear
+ *
+ * Traverse a specified page_cgroup list and try to drop them all. This doesn't
+ * reclaim the pages themselves - pages are moved to the parent (or root)
+ * group.
+ */
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+ int node, int zid, enum lru_list lru)
+{
+ struct lruvec *lruvec;
+ unsigned long flags;
+ struct list_head *list;
+ struct page *busy;
+ struct zone *zone;
+
+ zone = &NODE_DATA(node)->node_zones[zid];
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ list = &lruvec->lists[lru];
+
+ busy = NULL;
+ do {
+ struct page_cgroup *pc;
+ struct page *page;
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ if (list_empty(list)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ break;
+ }
+ page = list_entry(list->prev, struct page, lru);
+ if (busy == page) { /* same page failed last time: rotate it to the head and pick another */
+ list_move(&page->lru, list);
+ busy = NULL;
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ continue;
+ }
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+
+ pc = lookup_page_cgroup(page);
+
+ if (mem_cgroup_move_parent(page, pc, memcg)) {
+ /* found lock contention or "pc" is obsolete. */
+ busy = page;
+ } else
+ busy = NULL;
+ cond_resched();
+ } while (!list_empty(list));
+}
+
+/*
+ * Make mem_cgroup's charge 0 if there is no task, by moving
+ * all the charges and pages to the parent.
+ * This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
+ */
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
+{
+ int node, zid;
+ u64 usage;
+
+ do {
+ /* This is for making all *used* pages to be on LRU. */
+ lru_add_drain_all();
+ drain_all_stock_sync(memcg);
+ mem_cgroup_start_move(memcg);
+ for_each_node_state(node, N_MEMORY) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ enum lru_list lru;
+ for_each_lru(lru) {
+ mem_cgroup_force_empty_list(memcg,
+ node, zid, lru);
+ }
+ }
+ }
+ mem_cgroup_end_move(memcg);
+ memcg_oom_recover(memcg);
+ cond_resched();
+
+ /*
+ * Kernel memory may not necessarily be trackable to a specific
+ * process. So they are not migrated, and therefore we can't
+ * expect their value to drop to 0 here.
+ * Having res filled up with kmem only is enough.
+ *
+ * This is a safety check because mem_cgroup_force_empty_list
+ * could have raced with mem_cgroup_replace_page_cache callers
+ * so the lru seemed empty but the page could have been added
+ * right after the check. RES_USAGE should be safe as we always
+ * charge before adding to the LRU.
+ *
+ * NOTE(review): usage is unsigned, so this relies on the res
+ * usage never reading lower than the kmem usage - confirm.
+ */
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+ res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ } while (usage > 0);
+}
+
+/*
+ * Test whether @memcg has children, dead or alive. Note that this
+ * function doesn't care whether @memcg has use_hierarchy enabled and
+ * returns %true if there are child csses according to the cgroup
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+ bool ret;
+
+ /*
+ * The lock does not prevent addition or deletion of children, but
+ * it prevents a new child from being initialized based on this
+ * parent in css_online(), so it's enough to decide whether
+ * hierarchically inherited attributes can still be changed or not.
+ */
+ lockdep_assert_held(&memcg_create_mutex);
+
+ rcu_read_lock();
+ ret = css_next_child(NULL, &memcg->css); /* a non-NULL first child converts to true */
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Returns 0, or -EINTR when a signal is pending for the current task.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+
+ /* we call try-to-free pages to make this cgroup empty */
+ lru_add_drain_all();
+ /* try to free all pages in this cgroup */
+ while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
+ int progress;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
+ false);
+ if (!progress) { /* a retry is consumed only when reclaim made no progress */
+ nr_retries--;
+ /* maybe some writeback is necessary */
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ }
+
+ return 0;
+}
+
+static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (mem_cgroup_is_root(memcg)) /* the root cgroup cannot be force-emptied */
+ return -EINVAL;
+ return mem_cgroup_force_empty(memcg) ?: nbytes; /* error code if any, else the whole write is consumed */
+}
+
+static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_css(css)->use_hierarchy; /* report the current use_hierarchy setting */
+}
+
+static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ int retval = 0;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
+
+ mutex_lock(&memcg_create_mutex);
+
+ if (memcg->use_hierarchy == val) /* no change requested: succeed without touching anything */
+ goto out;
+
+ /*
+ * If parent's use_hierarchy is set, we can't make any modifications
+ * in the child subtrees. If it is unset, then the change can
+ * occur, provided the current cgroup has no children.
+ *
+ * For the root cgroup, parent_memcg is NULL, we allow value to be
+ * set if there are no children.
+ *
+ * Only 0 and 1 are accepted; a disallowed value or a parent with
+ * use_hierarchy set yields -EINVAL, existing children yield -EBUSY.
+ */
+ if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
+ (val == 1 || val == 0)) {
+ if (!memcg_has_children(memcg))
+ memcg->use_hierarchy = val;
+ else
+ retval = -EBUSY;
+ } else
+ retval = -EINVAL;
+
+out:
+ mutex_unlock(&memcg_create_mutex);
+
+ return retval;
+}
+
+
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
+{
+ struct mem_cgroup *iter;
+ long val = 0;
+
+ /*
+ * Sum statistic @idx over @memcg's tree as walked by
+ * for_each_mem_cgroup_tree().
+ * Per-cpu values can be negative, use a signed accumulator
+ */
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? clamp a negative total to 0 */
+ val = 0;
+ return val;
+}
+
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ u64 val;
+
+ if (!mem_cgroup_is_root(memcg)) { /* non-root: read the res_counter directly */
+ if (!swap)
+ return res_counter_read_u64(&memcg->res, RES_USAGE);
+ else
+ return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ }
+
+ /*
+ * For the root memcg the usage is computed from the recursive
+ * statistics (cache + rss, plus swap when @swap is set) instead.
+ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+ * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+ */
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+ if (swap)
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+ return val << PAGE_SHIFT; /* the sum is shifted by PAGE_SHIFT: pages -> bytes */
+}
+
+ /*
+  * Shared read handler for the res_counter backed files.
+  *
+  * @cft->private encodes the resource type and the attribute to read.
+  * RES_USAGE of the _MEM and _MEMSWAP types goes through
+  * mem_cgroup_usage(); every other attribute is read straight from the
+  * matching res_counter. An unknown type is a BUG().
+  */
+ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ 			       struct cftype *cft)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ 	enum res_type type = MEMFILE_TYPE(cft->private);
+ 	int name = MEMFILE_ATTR(cft->private);
+ 	u64 val;
+
+ 	switch (type) {
+ 	case _MEM:
+ 		val = (name == RES_USAGE) ?
+ 			mem_cgroup_usage(memcg, false) :
+ 			res_counter_read_u64(&memcg->res, name);
+ 		break;
+ 	case _MEMSWAP:
+ 		val = (name == RES_USAGE) ?
+ 			mem_cgroup_usage(memcg, true) :
+ 			res_counter_read_u64(&memcg->memsw, name);
+ 		break;
+ 	case _KMEM:
+ 		val = res_counter_read_u64(&memcg->kmem, name);
+ 		break;
+ 	default:
+ 		BUG();
+ 	}
+
+ 	return val;
+ }
+
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * Turn on kernel memory accounting for @memcg and apply @limit to its
+ * kmem counter. Returns 0 right away if @memcg is already kmem-active.
+ *
+ * Fails with -EBUSY if the cgroup already has tasks, or if it uses
+ * hierarchy and already has children. A failure of the ID allocation or
+ * of memcg_update_all_caches() is returned unchanged; in the latter case
+ * the allocated ID is released again.
+ *
+ * Should be called with activate_kmem_mutex held.
+ */
+ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+ {
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
+ /*
+ * For simplicity, we won't allow this to be disabled. It also can't
+ * be changed if the cgroup has children already, or if tasks had
+ * already joined.
+ *
+ * If tasks join before we set the limit, a person looking at
+ * kmem.usage_in_bytes will have no way to determine when it took
+ * place, which makes the value quite meaningless.
+ *
+ * After it first became limited, changes in the value of the limit are
+ * of course permitted.
+ */
+ mutex_lock(&memcg_create_mutex);
+ if (cgroup_has_tasks(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
+
+ /* Reserve an ID in the range [0, MEMCG_CACHES_MAX_SIZE). */
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ mutex_lock(&memcg_slab_mutex);
+ err = memcg_update_all_caches(memcg_id + 1);
+ mutex_unlock(&memcg_slab_mutex);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
+ out:
+ /* Pairs with memcg_stop_kmem_account() above on every exit path. */
+ memcg_resume_kmem_account();
+ return err;
+
+ out_rmid:
+ /* Give back the ID reserved above, then take the common exit. */
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+ }
+
+ /*
+  * Locked wrapper: calls __memcg_activate_kmem() with activate_kmem_mutex
+  * held and returns its result.
+  */
+ static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ 			       unsigned long long limit)
+ {
+ 	int err;
+
+ 	mutex_lock(&activate_kmem_mutex);
+ 	err = __memcg_activate_kmem(memcg, limit);
+ 	mutex_unlock(&activate_kmem_mutex);
+
+ 	return err;
+ }
+
+ /*
+  * Apply @val as the kmem limit of @memcg. A cgroup that is not yet
+  * kmem-active is activated with that limit; an active one only has the
+  * limit of its kmem counter changed.
+  */
+ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ 				   unsigned long long val)
+ {
+ 	if (memcg_kmem_is_active(memcg))
+ 		return res_counter_set_limit(&memcg->kmem, val);
+
+ 	return memcg_activate_kmem(memcg, val);
+ }
+
+ /*
+  * Inherit kmem activation from the parent: when the parent of @memcg is
+  * kmem-active, activate @memcg as well with a limit of RES_COUNTER_MAX.
+  * Returns 0 when there is no parent or the parent is not active,
+  * otherwise the result of __memcg_activate_kmem().
+  */
+ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ 	int err = 0;
+
+ 	if (!parent)
+ 		return 0;
+
+ 	mutex_lock(&activate_kmem_mutex);
+ 	/*
+ 	 * If the parent cgroup is not kmem-active now, it cannot be activated
+ 	 * after this point, because it has at least one child already.
+ 	 */
+ 	if (memcg_kmem_is_active(parent))
+ 		err = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ 	mutex_unlock(&activate_kmem_mutex);
+
+ 	return err;
+ }
+#else
+ /* Without CONFIG_MEMCG_KMEM a kmem limit cannot be set: always -EINVAL. */
+ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+ {
+ return -EINVAL;
+ }
+#endif /* CONFIG_MEMCG_KMEM */
+
+/*
+ * The user of this function is...
+ * RES_LIMIT.
+ */
+static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ enum res_type type;
+ int name;
+ unsigned long long val;
+ int ret;
+
+ buf = strstrip(buf);
+ type = MEMFILE_TYPE(of_cft(of)->private);
+ name = MEMFILE_ATTR(of_cft(of)->private);
+
+ switch (name) {
+ case RES_LIMIT:
+ if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+ ret = -EINVAL;
+ break;
+ }
+ /* This function does all necessary parse...reuse it */
+ ret = res_counter_memparse_write_strategy(buf, &val);
+ if (ret)
+ break;
+ if (type == _MEM)
+ ret = mem_cgroup_resize_limit(memcg, val);
+ else if (type == _MEMSWAP)
+ ret = mem_cgroup_resize_memsw_limit(memcg, val);
+ else if (type == _KMEM)
+ ret = memcg_update_kmem_limit(memcg, val);
+ else
+ return -EINVAL;
+ break;
+ case RES_SOFT_LIMIT:
+ ret = res_counter_memparse_write_strategy(buf, &val);
+ if (ret)
+ break;
+ /*
+ * For memsw, soft limits are hard to implement in terms
+ * of semantics, for now, we support soft limits for
+ * control without swap
+ */
+ if (type == _MEM)
+ ret = res_counter_set_soft_limit(&memcg->res, val);
+ else
+ ret = -EINVAL;
+ break;
+ default:
+ ret = -EINVAL; /* should be BUG() ? */
+ break;
+ }
+ return ret ?: nbytes;
+}
+
+ /*
+  * Compute the effective memory and memory+swap limits of @memcg and
+  * store them in @mem_limit and @memsw_limit.
+  *
+  * Starts from @memcg's own RES_LIMIT values. If @memcg uses hierarchy,
+  * walks up the parent chain, stopping at the first ancestor that does
+  * not use hierarchy, and keeps the minimum of each limit seen.
+  */
+ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
+ 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
+ {
+ 	unsigned long long lowest, lowest_memsw, cur;
+
+ 	lowest = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ 	lowest_memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+
+ 	if (memcg->use_hierarchy) {
+ 		while (memcg->css.parent) {
+ 			memcg = mem_cgroup_from_css(memcg->css.parent);
+ 			if (!memcg->use_hierarchy)
+ 				break;
+
+ 			cur = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ 			lowest = min(lowest, cur);
+
+ 			cur = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ 			lowest_memsw = min(lowest_memsw, cur);
+ 		}
+ 	}
+
+ 	*mem_limit = lowest;
+ 	*memsw_limit = lowest_memsw;
+ }
+
+ /*
+  * Shared write handler that resets a counter attribute.
+  *
+  * For RES_MAX_USAGE the recorded maximum of the counter selected by the
+  * resource type is reset; for RES_FAILCNT its failure count is reset. An
+  * unknown resource type for either attribute yields -EINVAL. Any other
+  * attribute is ignored. The written data itself is not examined.
+  *
+  * Returns @nbytes unless -EINVAL was returned.
+  */
+ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
+ 				size_t nbytes, loff_t off)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ 	enum res_type type = MEMFILE_TYPE(of_cft(of)->private);
+ 	int name = MEMFILE_ATTR(of_cft(of)->private);
+
+ 	switch (name) {
+ 	case RES_MAX_USAGE:
+ 		switch (type) {
+ 		case _MEM:
+ 			res_counter_reset_max(&memcg->res);
+ 			break;
+ 		case _MEMSWAP:
+ 			res_counter_reset_max(&memcg->memsw);
+ 			break;
+ 		case _KMEM:
+ 			res_counter_reset_max(&memcg->kmem);
+ 			break;
+ 		default:
+ 			return -EINVAL;
+ 		}
+ 		break;
+ 	case RES_FAILCNT:
+ 		switch (type) {
+ 		case _MEM:
+ 			res_counter_reset_failcnt(&memcg->res);
+ 			break;
+ 		case _MEMSWAP:
+ 			res_counter_reset_failcnt(&memcg->memsw);
+ 			break;
+ 		case _KMEM:
+ 			res_counter_reset_failcnt(&memcg->kmem);
+ 			break;
+ 		default:
+ 			return -EINVAL;
+ 		}
+ 		break;
+ 	}
+
+ 	return nbytes;
+ }
+
+ /* Read handler for "move_charge_at_immigrate": reports the stored value. */
+ static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
+ 				       struct cftype *cft)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ 	return memcg->move_charge_at_immigrate;
+ }
+
+#ifdef CONFIG_MMU
+ /*
+  * Write handler for "move_charge_at_immigrate". Values that do not fit
+  * in NR_MOVE_TYPE bits are rejected with -EINVAL; anything else is
+  * stored as is. Returns 0 on success.
+  */
+ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+ 					struct cftype *cft, u64 val)
+ {
+ 	struct mem_cgroup *target = mem_cgroup_from_css(css);
+
+ 	if (val >= (1 << NR_MOVE_TYPE))
+ 		return -EINVAL;
+
+ 	/*
+ 	 * No kind of locking is needed in here, because ->can_attach() will
+ 	 * check this value once in the beginning of the process, and then carry
+ 	 * on with stale data. This means that changes to this value will only
+ 	 * affect task migrations starting after the change.
+ 	 */
+ 	target->move_charge_at_immigrate = val;
+
+ 	return 0;
+ }
+#else
+ /* Without CONFIG_MMU the value cannot be set: always -ENOSYS. */
+ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+ {
+ return -ENOSYS;
+ }
+#endif
+
+#ifdef CONFIG_NUMA
+ /*
+  * seq_file show handler for "numa_stat".
+  *
+  * For each of the "total", "file", "anon" and "unevictable" LRU masks it
+  * prints one line with the cgroup-wide page count followed by one
+  * " N<nid>=<count>" field per node that has memory. A second group of
+  * lines, prefixed "hierarchical_", prints the same figures summed over
+  * the tree walked by for_each_mem_cgroup_tree(). Always returns 0.
+  */
+ static int memcg_numa_stat_show(struct seq_file *m, void *v)
+ {
+ 	struct numa_stat {
+ 		const char *name;
+ 		unsigned int lru_mask;
+ 	};
+
+ 	static const struct numa_stat stats[] = {
+ 		{ "total", LRU_ALL },
+ 		{ "file", LRU_ALL_FILE },
+ 		{ "anon", LRU_ALL_ANON },
+ 		{ "unevictable", BIT(LRU_UNEVICTABLE) },
+ 	};
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ 	struct mem_cgroup *iter;
+ 	unsigned long nr;
+ 	unsigned int s;
+ 	int nid;
+
+ 	/* Figures of this cgroup alone. */
+ 	for (s = 0; s < ARRAY_SIZE(stats); s++) {
+ 		nr = mem_cgroup_nr_lru_pages(memcg, stats[s].lru_mask);
+ 		seq_printf(m, "%s=%lu", stats[s].name, nr);
+ 		for_each_node_state(nid, N_MEMORY) {
+ 			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ 							  stats[s].lru_mask);
+ 			seq_printf(m, " N%d=%lu", nid, nr);
+ 		}
+ 		seq_putc(m, '\n');
+ 	}
+
+ 	/* Same figures accumulated over the tree. */
+ 	for (s = 0; s < ARRAY_SIZE(stats); s++) {
+ 		nr = 0;
+ 		for_each_mem_cgroup_tree(iter, memcg)
+ 			nr += mem_cgroup_nr_lru_pages(iter, stats[s].lru_mask);
+ 		seq_printf(m, "hierarchical_%s=%lu", stats[s].name, nr);
+ 		for_each_node_state(nid, N_MEMORY) {
+ 			nr = 0;
+ 			for_each_mem_cgroup_tree(iter, memcg)
+ 				nr += mem_cgroup_node_nr_lru_pages(
+ 					iter, nid, stats[s].lru_mask);
+ 			seq_printf(m, " N%d=%lu", nid, nr);
+ 		}
+ 		seq_putc(m, '\n');
+ 	}
+
+ 	return 0;
+ }
+#endif /* CONFIG_NUMA */
+
+ /*
+ * Compile-time check only: breaks the build when mem_cgroup_lru_names
+ * does not have exactly NR_LRU_LISTS entries.
+ */
+ static inline void mem_cgroup_lru_names_not_uptodate(void)
+ {
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
+ }
+
+ /*
+ * seq_file show handler for the "stat" file.
+ *
+ * Prints, in this order: the cgroup's own statistics, events and per-LRU
+ * sizes; the hierarchical memory limit (and the memsw limit when swap
+ * accounting is on); "total_"-prefixed sums of the same statistics,
+ * events and LRU sizes over the tree walked by for_each_mem_cgroup_tree();
+ * and, under CONFIG_DEBUG_VM, the recent_rotated/recent_scanned reclaim
+ * counters summed over all online nodes and zones.
+ *
+ * Statistics and LRU sizes are multiplied by PAGE_SIZE before printing;
+ * events are printed as plain counts. The swap statistic is skipped when
+ * do_swap_account is off. Always returns 0.
+ */
+ static int memcg_stat_show(struct seq_file *m, void *v)
+ {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *mi;
+ unsigned int i;
+
+ /* Values of this cgroup alone */
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+ seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+ mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+ }
+
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
+ mem_cgroup_read_events(memcg, i));
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
+ mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+
+ /* Hierarchical information */
+ {
+ unsigned long long limit, memsw_limit;
+ memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
+ seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
+ if (do_swap_account)
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
+ memsw_limit);
+ }
+
+ /* Same values summed over the tree, printed with a "total_" prefix */
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ long long val = 0;
+
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
+ seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+ }
+
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_events(mi, i);
+ seq_printf(m, "total_%s %llu\n",
+ mem_cgroup_events_names[i], val);
+ }
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ unsigned long long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
+ }
+
+#ifdef CONFIG_DEBUG_VM
+ {
+ int nid, zid;
+ struct mem_cgroup_per_zone *mz;
+ struct zone_reclaim_stat *rstat;
+ /* index 0 is printed as "anon", index 1 as "file" below */
+ unsigned long recent_rotated[2] = {0, 0};
+ unsigned long recent_scanned[2] = {0, 0};
+
+ for_each_online_node(nid)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ rstat = &mz->lruvec.reclaim_stat;
+
+ recent_rotated[0] += rstat->recent_rotated[0];
+ recent_rotated[1] += rstat->recent_rotated[1];
+ recent_scanned[0] += rstat->recent_scanned[0];
+ recent_scanned[1] += rstat->recent_scanned[1];
+ }
+ seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
+ seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
+ seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
+ seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ }
+#endif
+
+ return 0;
+ }
+
+ /* Read handler for "swappiness": returns mem_cgroup_swappiness() of the cgroup. */
+ static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
+ 				      struct cftype *cft)
+ {
+ 	return mem_cgroup_swappiness(mem_cgroup_from_css(css));
+ }
+
+ /*
+  * Write handler for "swappiness". Values above 100 are rejected with
+  * -EINVAL. For a cgroup without a parent the value is stored in the
+  * global vm_swappiness; otherwise it is stored in the cgroup itself.
+  * Returns 0 on success.
+  */
+ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
+ 				       struct cftype *cft, u64 val)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ 	if (val > 100)
+ 		return -EINVAL;
+
+ 	if (!css->parent)
+ 		vm_swappiness = val;
+ 	else
+ 		memcg->swappiness = val;
+
+ 	return 0;
+ }
+
+ /*
+ * Signal the eventfd of every threshold that the usage of @memcg has
+ * crossed since current_threshold was last updated, then store the new
+ * position in current_threshold.
+ *
+ * @swap selects the memsw thresholds and the memory+swap usage instead
+ * of the plain memory ones. The primary threshold array is read under
+ * rcu_read_lock(); nothing is done when no array is installed.
+ */
+ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
+ {
+ struct mem_cgroup_threshold_ary *t;
+ u64 usage;
+ int i;
+
+ rcu_read_lock();
+ if (!swap)
+ t = rcu_dereference(memcg->thresholds.primary);
+ else
+ t = rcu_dereference(memcg->memsw_thresholds.primary);
+
+ if (!t)
+ goto unlock;
+
+ usage = mem_cgroup_usage(memcg, swap);
+
+ /*
+ * current_threshold points to threshold just below or equal to usage.
+ * If it's not true, a threshold was crossed after last
+ * call of __mem_cgroup_threshold().
+ */
+ i = t->current_threshold;
+
+ /*
+ * Iterate backward over array of thresholds starting from
+ * current_threshold and check if a threshold is crossed.
+ * If none of thresholds below usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* i = current_threshold + 1 */
+ i++;
+
+ /*
+ * Iterate forward over array of thresholds starting from
+ * current_threshold+1 and check if a threshold is crossed.
+ * If none of thresholds above usage is crossed, we read
+ * only one element of the array here.
+ */
+ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
+ eventfd_signal(t->entries[i].eventfd, 1);
+
+ /* Update current_threshold */
+ t->current_threshold = i - 1;
+ unlock:
+ rcu_read_unlock();
+ }
+
+ /*
+  * Run the threshold check on @memcg and then on each cgroup returned by
+  * parent_mem_cgroup(), until it returns NULL. The memsw thresholds are
+  * checked as well when do_swap_account is set.
+  */
+ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
+ {
+ 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ 		__mem_cgroup_threshold(memcg, false);
+ 		if (do_swap_account)
+ 			__mem_cgroup_threshold(memcg, true);
+ 	}
+ }
+
+ /*
+  * sort() comparison callback ordering struct mem_cgroup_threshold
+  * entries by ascending threshold value. Returns -1, 0 or 1.
+  */
+ static int compare_thresholds(const void *a, const void *b)
+ {
+ 	const struct mem_cgroup_threshold *lhs = a;
+ 	const struct mem_cgroup_threshold *rhs = b;
+
+ 	if (lhs->threshold < rhs->threshold)
+ 		return -1;
+
+ 	return lhs->threshold > rhs->threshold ? 1 : 0;
+ }
+
+ /*
+  * Signal every eventfd on @memcg's oom_notify list. The list is walked
+  * with memcg_oom_lock held. Always returns 0.
+  */
+ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup_eventfd_list *entry;
+
+ 	spin_lock(&memcg_oom_lock);
+ 	list_for_each_entry(entry, &memcg->oom_notify, list)
+ 		eventfd_signal(entry->eventfd, 1);
+ 	spin_unlock(&memcg_oom_lock);
+
+ 	return 0;
+ }
+
+ /*
+  * Deliver the OOM notification to every cgroup visited by
+  * for_each_mem_cgroup_tree() starting at @memcg.
+  */
+ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+ {
+ 	struct mem_cgroup *pos;
+
+ 	for_each_mem_cgroup_tree(pos, memcg)
+ 		mem_cgroup_oom_notify_cb(pos);
+ }
+
+ /*
+ * Register @eventfd to be signalled when the usage of @memcg crosses the
+ * threshold given in @args.
+ *
+ * @args is parsed with res_counter_memparse_write_strategy(). @type must
+ * be _MEM or _MEMSWAP and selects which threshold set is extended; any
+ * other value is a BUG().
+ *
+ * A new array one entry larger than the current primary array is
+ * allocated, filled, sorted and published with rcu_assign_pointer(). The
+ * previous primary array is kept as the spare, and the previous spare is
+ * freed. The whole operation runs under memcg->thresholds_lock.
+ *
+ * Returns 0 on success, the parse error, or -ENOMEM.
+ */
+ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
+ {
+ struct mem_cgroup_thresholds *thresholds;
+ struct mem_cgroup_threshold_ary *new;
+ u64 threshold, usage;
+ int i, size, ret;
+
+ ret = res_counter_memparse_write_strategy(args, &threshold);
+ if (ret)
+ return ret;
+
+ mutex_lock(&memcg->thresholds_lock);
+
+ if (type == _MEM)
+ thresholds = &memcg->thresholds;
+ else if (type == _MEMSWAP)
+ thresholds = &memcg->memsw_thresholds;
+ else
+ BUG();
+
+ usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+ /* Check if a threshold crossed before adding a new one */
+ if (thresholds->primary)
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ size = thresholds->primary ? thresholds->primary->size + 1 : 1;
+
+ /* Allocate memory for new array of thresholds */
+ new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
+ GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ new->size = size;
+
+ /* Copy thresholds (if any) to new array */
+ if (thresholds->primary) {
+ memcpy(new->entries, thresholds->primary->entries, (size - 1) *
+ sizeof(struct mem_cgroup_threshold));
+ }
+
+ /* Add new threshold */
+ new->entries[size - 1].eventfd = eventfd;
+ new->entries[size - 1].threshold = threshold;
+
+ /* Sort thresholds. Registering of new threshold isn't time-critical */
+ sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ compare_thresholds, NULL);
+
+ /* Find current threshold */
+ new->current_threshold = -1;
+ for (i = 0; i < size; i++) {
+ if (new->entries[i].threshold <= usage) {
+ /*
+ * new->current_threshold will not be used until
+ * rcu_assign_pointer(), so it's safe to increment
+ * it here.
+ */
+ ++new->current_threshold;
+ } else
+ break;
+ }
+
+ /* Free old spare buffer and save old primary buffer as spare */
+ kfree(thresholds->spare);
+ thresholds->spare = thresholds->primary;
+
+ rcu_assign_pointer(thresholds->primary, new);
+
+ /* To be sure that nobody uses thresholds */
+ synchronize_rcu();
+
+ unlock:
+ mutex_unlock(&memcg->thresholds_lock);
+
+ return ret;
+ }
+
+ /* Register @eventfd on the memory (_MEM) usage thresholds of @memcg. */
+ static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+ {
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+ }
+
+ /* Register @eventfd on the memory+swap (_MEMSWAP) usage thresholds of @memcg. */
+ static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+ {
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+ }
+
+ /*
+  * Remove every threshold registered for @eventfd from @memcg.
+  *
+  * @type must be _MEM or _MEMSWAP and selects the threshold set; any
+  * other value is a BUG(). The surviving entries are copied into the
+  * spare array, which is then published as the new primary array with
+  * rcu_assign_pointer(); the old primary becomes the spare. When no entry
+  * survives, both arrays are released and primary is set to NULL.
+  *
+  * One call removes all entries that belong to @eventfd, so a later call
+  * for the same eventfd may find nothing to remove. In that case the
+  * arrays are left untouched: the spare array is only guaranteed to be
+  * large enough when at least one entry is dropped, and it may be NULL.
+  *
+  * Runs under memcg->thresholds_lock.
+  */
+ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ 		struct eventfd_ctx *eventfd, enum res_type type)
+ {
+ 	struct mem_cgroup_thresholds *thresholds;
+ 	struct mem_cgroup_threshold_ary *new;
+ 	u64 usage;
+ 	int i, j, size, entries;
+
+ 	mutex_lock(&memcg->thresholds_lock);
+ 	if (type == _MEM)
+ 		thresholds = &memcg->thresholds;
+ 	else if (type == _MEMSWAP)
+ 		thresholds = &memcg->memsw_thresholds;
+ 	else
+ 		BUG();
+
+ 	if (!thresholds->primary)
+ 		goto unlock;
+
+ 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+
+ 	/* Check if a threshold crossed before removing */
+ 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
+
+ 	/* Count the entries to keep (size) and the entries to drop */
+ 	size = 0;
+ 	entries = 0;
+ 	for (i = 0; i < thresholds->primary->size; i++) {
+ 		if (thresholds->primary->entries[i].eventfd != eventfd)
+ 			size++;
+ 		else
+ 			entries++;
+ 	}
+
+ 	/* Nothing registered for this eventfd: leave both arrays alone */
+ 	if (!entries)
+ 		goto unlock;
+
+ 	new = thresholds->spare;
+
+ 	/* Set thresholds array to NULL if we don't have thresholds */
+ 	if (!size) {
+ 		kfree(new);
+ 		new = NULL;
+ 		goto swap_buffers;
+ 	}
+
+ 	new->size = size;
+
+ 	/* Copy thresholds and find current threshold */
+ 	new->current_threshold = -1;
+ 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
+ 		if (thresholds->primary->entries[i].eventfd == eventfd)
+ 			continue;
+
+ 		new->entries[j] = thresholds->primary->entries[i];
+ 		if (new->entries[j].threshold <= usage) {
+ 			/*
+ 			 * new->current_threshold will not be used
+ 			 * until rcu_assign_pointer(), so it's safe to increment
+ 			 * it here.
+ 			 */
+ 			++new->current_threshold;
+ 		}
+ 		j++;
+ 	}
+
+ swap_buffers:
+ 	/* Swap primary and spare array */
+ 	thresholds->spare = thresholds->primary;
+ 	/* If all events are unregistered, free the spare array */
+ 	if (!new) {
+ 		kfree(thresholds->spare);
+ 		thresholds->spare = NULL;
+ 	}
+
+ 	rcu_assign_pointer(thresholds->primary, new);
+
+ 	/* To be sure that nobody uses thresholds */
+ 	synchronize_rcu();
+ unlock:
+ 	mutex_unlock(&memcg->thresholds_lock);
+ }
+
+ /* Unregister @eventfd from the memory (_MEM) usage thresholds of @memcg. */
+ static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ 		struct eventfd_ctx *eventfd)
+ {
+ 	/* void function: call the helper without "return <expression>" */
+ 	__mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+ }
+
+ /* Unregister @eventfd from the memory+swap (_MEMSWAP) usage thresholds of @memcg. */
+ static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ 		struct eventfd_ctx *eventfd)
+ {
+ 	/* void function: call the helper without "return <expression>" */
+ 	__mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+ }
+
+ /*
+  * Add @eventfd to the OOM notification list of @memcg. If the cgroup is
+  * already under OOM, the eventfd is signalled immediately. @args is not
+  * used. Returns 0 on success or -ENOMEM.
+  */
+ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ 		struct eventfd_ctx *eventfd, const char *args)
+ {
+ 	struct mem_cgroup_eventfd_list *entry;
+
+ 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ 	if (!entry)
+ 		return -ENOMEM;
+
+ 	spin_lock(&memcg_oom_lock);
+
+ 	entry->eventfd = eventfd;
+ 	list_add(&entry->list, &memcg->oom_notify);
+
+ 	/* already in OOM ? */
+ 	if (atomic_read(&memcg->under_oom))
+ 		eventfd_signal(eventfd, 1);
+
+ 	spin_unlock(&memcg_oom_lock);
+
+ 	return 0;
+ }
+
+ /*
+  * Remove and free every entry on @memcg's OOM notification list that
+  * refers to @eventfd. The list is modified with memcg_oom_lock held.
+  */
+ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ 		struct eventfd_ctx *eventfd)
+ {
+ 	struct mem_cgroup_eventfd_list *pos, *next;
+
+ 	spin_lock(&memcg_oom_lock);
+
+ 	list_for_each_entry_safe(pos, next, &memcg->oom_notify, list) {
+ 		if (pos->eventfd != eventfd)
+ 			continue;
+
+ 		list_del(&pos->list);
+ 		kfree(pos);
+ 	}
+
+ 	spin_unlock(&memcg_oom_lock);
+ }
+
+ /*
+  * seq_file show handler for "oom_control": prints the oom_kill_disable
+  * setting and whether the cgroup is currently under OOM. Always
+  * returns 0.
+  */
+ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ 	bool under_oom = (bool)atomic_read(&memcg->under_oom);
+
+ 	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ 	seq_printf(sf, "under_oom %d\n", under_oom);
+
+ 	return 0;
+ }
+
+ /*
+  * Write handler for "oom_control". Only 0 and 1 are accepted, and never
+  * on a cgroup without a parent; anything else yields -EINVAL. Writing 0
+  * also calls memcg_oom_recover(). Returns 0 on success.
+  */
+ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
+ 					struct cftype *cft, u64 val)
+ {
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ 	/* cannot set to root cgroup and only 0 and 1 are allowed */
+ 	if (!css->parent || (val != 0 && val != 1))
+ 		return -EINVAL;
+
+ 	memcg->oom_kill_disable = val;
+ 	if (val == 0)
+ 		memcg_oom_recover(memcg);
+
+ 	return 0;
+ }
+
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+  * Set up the kmem state of a new @memcg: mark it as having no kmemcg ID,
+  * inherit kmem activation from the parent, then initialise socket
+  * accounting. Returns the first error encountered, or the result of
+  * mem_cgroup_sockets_init().
+  */
+ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+ {
+ 	int err;
+
+ 	memcg->kmemcg_id = -1;
+
+ 	err = memcg_propagate_kmem(memcg);
+ 	if (err)
+ 		return err;
+
+ 	return mem_cgroup_sockets_init(memcg, ss);
+ }
+
+ /* Tear down the socket accounting state of @memcg. */
+ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+ {
+ mem_cgroup_sockets_destroy(memcg);
+ }
+
+ /*
+ * Offline-time handling of kmem accounting for @memcg. Does nothing when
+ * the cgroup is not kmem-active. Otherwise takes an extra css reference
+ * and marks the cgroup dead; if the kmem usage is already zero, the dead
+ * mark is cleared again and the reference dropped right here.
+ */
+ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+ {
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ /*
+ * kmem charges can outlive the cgroup. In the case of slab
+ * pages, for instance, a page contains objects from various
+ * processes. As we prevent from taking a reference for every
+ * such allocation we have to be careful when doing uncharge
+ * (see memcg_uncharge_kmem) and here during offlining.
+ *
+ * The idea is that only the _last_ uncharge which sees
+ * the dead memcg will drop the last reference. An additional
+ * reference is taken here before the group is marked dead
+ * which is then paired with css_put during uncharge resp. here.
+ *
+ * Although this might sound strange as this path is called from
+ * css_offline() when the reference might have dropped down to 0 and
+ * shouldn't be incremented anymore (css_tryget_online() would
+ * fail) we do not have other options because of the kmem
+ * allocations lifetime.
+ */
+ css_get(&memcg->css);
+
+ memcg_kmem_mark_dead(memcg);
+
+ /* Charges remain: this function leaves the reference held. */
+ if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+ return;
+
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ css_put(&memcg->css);
+ }
+#else
+ /* Without CONFIG_MEMCG_KMEM there is nothing to set up: always succeeds. */
+ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+ {
+ return 0;
+ }
+
+ /* Without CONFIG_MEMCG_KMEM there is nothing to tear down. */
+ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+ {
+ }
+
+ /* Without CONFIG_MEMCG_KMEM offlining needs no kmem handling. */
+ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+ {
+ }
+#endif
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+ /*
+  * poll_table queue callback: remembers the wait queue head @wqh in the
+  * event that embeds @pt and adds the event's wait entry to that queue.
+  */
+ static void memcg_event_ptable_queue_proc(struct file *file,
+ 		wait_queue_head_t *wqh, poll_table *pt)
+ {
+ 	struct mem_cgroup_event *ev;
+
+ 	ev = container_of(pt, struct mem_cgroup_event, pt);
+ 	ev->wqh = wqh;
+ 	add_wait_queue(wqh, &ev->wait);
+ }
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ buf = strstrip(buf);
+
+ efd = simple_strtoul(buf, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buf = endp + 1;
+
+ cfd = simple_strtoul(buf, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buf = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
+ &memory_cgrp_subsys);
+ ret = -EINVAL;
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
+ goto out_put_cfile;
+ }
+
+ ret = event->register_event(memcg, event->eventfd, buf);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return nbytes;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
+ /*
+ * Control files of the memory controller; the table ends with an empty
+ * entry.
+ *
+ * Entries built with MEMFILE_PRIVATE() pack a resource type and an
+ * attribute into .private, which the shared handlers decode again with
+ * MEMFILE_TYPE() and MEMFILE_ATTR().
+ *
+ * "pressure_level" has no handlers of its own here;
+ * memcg_write_event_control() recognises a control file named
+ * "memory.pressure_level" and routes it to the vmpressure callbacks.
+ */
+ static struct cftype mem_cgroup_files[] = {
+ {
+ .name = "usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "soft_limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "failcnt",
+ .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "stat",
+ .seq_show = memcg_stat_show,
+ },
+ {
+ .name = "force_empty",
+ .write = mem_cgroup_force_empty_write,
+ },
+ {
+ .name = "use_hierarchy",
+ .flags = CFTYPE_INSANE,
+ .write_u64 = mem_cgroup_hierarchy_write,
+ .read_u64 = mem_cgroup_hierarchy_read,
+ },
+ {
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
+ {
+ .name = "swappiness",
+ .read_u64 = mem_cgroup_swappiness_read,
+ .write_u64 = mem_cgroup_swappiness_write,
+ },
+ {
+ .name = "move_charge_at_immigrate",
+ .read_u64 = mem_cgroup_move_charge_read,
+ .write_u64 = mem_cgroup_move_charge_write,
+ },
+ {
+ .name = "oom_control",
+ .seq_show = mem_cgroup_oom_control_read,
+ .write_u64 = mem_cgroup_oom_control_write,
+ .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
+ },
+ {
+ .name = "pressure_level",
+ },
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memcg_numa_stat_show,
+ },
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+ {
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.failcnt",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+#ifdef CONFIG_SLABINFO
+ {
+ .name = "kmem.slabinfo",
+ .seq_show = mem_cgroup_slabinfo_read,
+ },
+#endif
+#endif
+ { }, /* terminate */
+ };
+
+#ifdef CONFIG_MEMCG_SWAP
+ /*
+ * Control files for the memory+swap (_MEMSWAP) counter, built only with
+ * CONFIG_MEMCG_SWAP. They use the same shared read/write/reset handlers
+ * as the plain memory files; the table ends with an empty entry.
+ */
+ static struct cftype memsw_cgroup_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+ };
+#endif
+/*
+ * Allocate and initialise the per-node info of @memcg for @node.
+ *
+ * This routine is called against possible nodes, but it is a BUG to call
+ * kmalloc() against an offline node, so a node without N_NORMAL_MEMORY is
+ * allocated with node id -1 instead.
+ *
+ * TODO: this routine can waste much memory for nodes which will never be
+ * onlined. It would be better to use a memory hotplug callback.
+ *
+ * Returns 0 on success, 1 if the allocation failed.
+ */
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+{
+	struct mem_cgroup_per_node *node_info;
+	int alloc_nid, zid;
+
+	alloc_nid = node_state(node, N_NORMAL_MEMORY) ? node : -1;
+	node_info = kzalloc_node(sizeof(*node_info), GFP_KERNEL, alloc_nid);
+	if (!node_info)
+		return 1;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct mem_cgroup_per_zone *mz = &node_info->zoneinfo[zid];
+
+		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->memcg = memcg;
+	}
+
+	/* publish only once every zone has been set up */
+	memcg->nodeinfo[node] = node_info;
+	return 0;
+}
+
+/* Free the per-node info that @memcg holds for @node. */
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+{
+	struct mem_cgroup_per_node *node_info = memcg->nodeinfo[node];
+
+	kfree(node_info);
+}
+
+/*
+ * Allocate a zeroed mem_cgroup with room for nr_node_ids per-node pointers
+ * behind it, plus its per-cpu statistics.
+ *
+ * Returns the new mem_cgroup, or NULL if either allocation failed.
+ */
+static struct mem_cgroup *mem_cgroup_alloc(void)
+{
+	size_t bytes = sizeof(struct mem_cgroup);
+	struct mem_cgroup *new_memcg;
+
+	bytes += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+	new_memcg = kzalloc(bytes, GFP_KERNEL);
+	if (!new_memcg)
+		return NULL;
+
+	new_memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!new_memcg->stat) {
+		kfree(new_memcg);
+		return NULL;
+	}
+
+	spin_lock_init(&new_memcg->pcp_counter_lock);
+	return new_memcg;
+}
+
+/*
+ * When a mem_cgroup is destroyed, references from swap_cgroup can remain
+ * (scanning all of them at force_empty is too costly...).
+ *
+ * Instead of clearing all references at force_empty, we remember the
+ * number of references from swap_cgroup and free the mem_cgroup when it
+ * goes down to 0.
+ *
+ * Removal of the cgroup itself succeeds regardless of refs from swap.
+ */
+
+/*
+ * Tear down @memcg: remove it from the trees, free the per-node info of
+ * every node and the per-cpu statistics, disarm its static keys and finally
+ * free the structure itself.
+ */
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	mem_cgroup_remove_from_trees(memcg);
+
+	for_each_node(nid) {
+		free_mem_cgroup_per_zone_info(memcg, nid);
+	}
+
+	free_percpu(memcg->stat);
+
+	/*
+	 * We need to make sure that (at least for now), the jump label
+	 * destruction code runs outside of the cgroup lock. This is because
+	 * get_online_cpus(), which is called from the static_branch update,
+	 * can't be called inside the cgroup_lock. cpusets are the ones
+	 * enforcing this dependency, so if they ever change, we might as well.
+	 *
+	 * schedule_work() will guarantee this happens. Be careful if you need
+	 * to move this code around, and make sure it is outside
+	 * the cgroup_lock.
+	 */
+	disarm_static_keys(memcg);
+	kfree(memcg);
+}
+
+/*
+ * Returns the parent mem_cgroup in the memcg hierarchy when hierarchy is
+ * enabled, i.e. the owner of @memcg's parent res counter, or NULL when
+ * that counter has no parent.
+ */
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+	struct res_counter *parent_res = memcg->res.parent;
+
+	if (parent_res)
+		return mem_cgroup_from_res_counter(parent_res, res);
+	return NULL;
+}
+EXPORT_SYMBOL(parent_mem_cgroup);
+
+/*
+ * Allocate one tree-per-node structure for every node and initialise the
+ * rb root and lock of each of its zones. Allocation failure is fatal
+ * (BUG_ON). Nodes without N_NORMAL_MEMORY are allocated with node id -1.
+ */
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct mem_cgroup_tree_per_node *tree_node;
+		int alloc_nid = nid;
+		int zid;
+
+		if (!node_state(nid, N_NORMAL_MEMORY))
+			alloc_nid = -1;
+		tree_node = kzalloc_node(sizeof(*tree_node), GFP_KERNEL,
+					 alloc_nid);
+		BUG_ON(!tree_node);
+
+		soft_limit_tree.rb_tree_per_node[nid] = tree_node;
+
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			struct mem_cgroup_tree_per_zone *tree_zone;
+
+			tree_zone = &tree_node->rb_tree_per_zone[zid];
+			tree_zone->rb_root = RB_ROOT;
+			spin_lock_init(&tree_zone->lock);
+		}
+	}
+}
+
+/*
+ * css_alloc callback: allocate a mem_cgroup with its per-node info and
+ * initialise the fields that do not depend on the parent. For the root
+ * (@parent_css == NULL) the res counters are initialised here as well and
+ * root_mem_cgroup is set.
+ *
+ * Returns the embedded css on success, ERR_PTR(-ENOMEM) on failure.
+ */
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_alloc();
+	int nid;
+
+	if (!memcg)
+		return ERR_PTR(-ENOMEM);
+
+	for_each_node(nid) {
+		if (alloc_mem_cgroup_per_zone_info(memcg, nid)) {
+			__mem_cgroup_free(memcg);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	/* root ? */
+	if (!parent_css) {
+		root_mem_cgroup = memcg;
+		res_counter_init(&memcg->res, NULL);
+		res_counter_init(&memcg->memsw, NULL);
+		res_counter_init(&memcg->kmem, NULL);
+	}
+
+	memcg->last_scanned_node = MAX_NUMNODES;
+	memcg->move_charge_at_immigrate = 0;
+
+	INIT_LIST_HEAD(&memcg->oom_notify);
+	mutex_init(&memcg->thresholds_lock);
+	spin_lock_init(&memcg->move_lock);
+	vmpressure_init(&memcg->vmpressure);
+	INIT_LIST_HEAD(&memcg->event_list);
+	spin_lock_init(&memcg->event_list_lock);
+
+	return &memcg->css;
+}
+
+/*
+ * css_online callback: inherit use_hierarchy, oom_kill_disable and
+ * swappiness from the parent and initialise the res counters, chained to
+ * the parent's counters only when the parent uses hierarchy.
+ *
+ * Returns -ENOSPC if css->id exceeds MEM_CGROUP_ID_MAX, 0 for a css
+ * without a parent, otherwise the result of memcg_init_kmem().
+ */
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
+	struct res_counter *res_parent = NULL;
+	struct res_counter *memsw_parent = NULL;
+	struct res_counter *kmem_parent = NULL;
+
+	if (css->id > MEM_CGROUP_ID_MAX)
+		return -ENOSPC;
+
+	if (!parent)
+		return 0;
+
+	mutex_lock(&memcg_create_mutex);
+
+	memcg->use_hierarchy = parent->use_hierarchy;
+	memcg->oom_kill_disable = parent->oom_kill_disable;
+	memcg->swappiness = mem_cgroup_swappiness(parent);
+
+	if (parent->use_hierarchy) {
+		/*
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
+		 */
+		res_parent = &parent->res;
+		memsw_parent = &parent->memsw;
+		kmem_parent = &parent->kmem;
+	} else if (parent != root_mem_cgroup) {
+		/*
+		 * Deeper hierarchy with use_hierarchy == false doesn't make
+		 * much sense so let cgroup subsystem know about this
+		 * unfortunate state in our controller.
+		 */
+		memory_cgrp_subsys.broken_hierarchy = true;
+	}
+
+	res_counter_init(&memcg->res, res_parent);
+	res_counter_init(&memcg->memsw, memsw_parent);
+	res_counter_init(&memcg->kmem, kmem_parent);
+
+	mutex_unlock(&memcg_create_mutex);
+
+	return memcg_init_kmem(memcg, &memory_cgrp_subsys);
+}
+
+/*
+ * Tell every ancestor of @memcg that a group from its hierarchy is gone by
+ * invalidating the ancestor's reclaim iterators.
+ */
+static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *ancestor;
+
+	for (ancestor = parent_mem_cgroup(memcg); ancestor;
+	     ancestor = parent_mem_cgroup(ancestor))
+		mem_cgroup_iter_invalidate(ancestor);
+
+	/*
+	 * If the root memcg is not hierarchical the walk above does not
+	 * cover it, so we have to invalidate it explicitly.
+	 */
+	if (!root_mem_cgroup->use_hierarchy)
+		mem_cgroup_iter_invalidate(root_mem_cgroup);
+}
+
+/*
+ * css_offline callback: unregister pending events, take kmem offline,
+ * invalidate the reclaim iterators of the ancestors, reparent the charges
+ * of this css and all its descendants, then drop the caches and the
+ * vmpressure state.
+ */
+static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup_event *ev, *next_ev;
+	struct cgroup_subsys_state *pos;
+
+	/*
+	 * Unregister events and notify userspace.
+	 * Notify userspace about cgroup removing only after rmdir of cgroup
+	 * directory to avoid race between userspace and kernelspace.
+	 */
+	spin_lock(&memcg->event_list_lock);
+	list_for_each_entry_safe(ev, next_ev, &memcg->event_list, list) {
+		list_del_init(&ev->list);
+		schedule_work(&ev->remove);
+	}
+	spin_unlock(&memcg->event_list_lock);
+
+	kmem_cgroup_css_offline(memcg);
+
+	mem_cgroup_invalidate_reclaim_iterators(memcg);
+
+	/*
+	 * This requires that offlining is serialized. Right now that is
+	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+	 */
+	css_for_each_descendant_post(pos, css) {
+		mem_cgroup_reparent_charges(mem_cgroup_from_css(pos));
+	}
+
+	memcg_unregister_all_caches(memcg);
+	vmpressure_cleanup(&memcg->vmpressure);
+}
+
+static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+{ /* installed as the .css_free callback of memory_cgrp_subsys */
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget_online() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget_online()
+ * rcu_read_unlock()
+ * disable css_tryget_online()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg); /* second pass; the first one runs in mem_cgroup_css_offline() */
+
+ memcg_destroy_kmem(memcg);
+ __mem_cgroup_free(memcg); /* frees the per-node info, the per-cpu stats and memcg itself */
+}
+
+#ifdef CONFIG_MMU
+/* Handlers for move charge at task migration. */
+/* number of single-page charges between two cond_resched() calls */
+#define PRECHARGE_COUNT_AT_ONCE	256
+
+/*
+ * Charge @count pages to mc.to in advance and account them in mc.precharge.
+ *
+ * For the root memcg only mc.precharge is bumped. Otherwise the whole
+ * amount is tried in one go first (for @count > 1); if that fails the
+ * pages are charged one by one.
+ *
+ * Returns 0 on success, -EINTR if a signal is pending, or the error from
+ * mem_cgroup_try_charge(). Pages charged before a failure stay accounted
+ * in mc.precharge.
+ */
+static int mem_cgroup_do_precharge(unsigned long count)
+{
+	struct mem_cgroup *memcg = mc.to;
+	int until_resched = PRECHARGE_COUNT_AT_ONCE;
+	int err = 0;
+
+	if (mem_cgroup_is_root(memcg)) {
+		/* we don't need css_get for root */
+		mc.precharge += count;
+		return 0;
+	}
+
+	/* try to charge at once */
+	if (count > 1) {
+		struct res_counter *unused;
+
+		/*
+		 * "memcg" cannot be under rmdir() because we've already checked
+		 * by cgroup_lock_live_cgroup() that it is not removed and we
+		 * are still under the same cgroup_mutex. So we can postpone
+		 * css_get().
+		 */
+		if (!res_counter_charge(&memcg->res, PAGE_SIZE * count,
+					&unused)) {
+			if (!do_swap_account ||
+			    !res_counter_charge(&memcg->memsw,
+						PAGE_SIZE * count, &unused)) {
+				mc.precharge += count;
+				return 0;
+			}
+			/* memsw refused: roll back the res charge */
+			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
+		}
+	}
+
+	/* fall back to one by one charge */
+	for (; count; count--) {
+		if (signal_pending(current)) {
+			err = -EINTR;
+			break;
+		}
+		if (!until_resched--) {
+			until_resched = PRECHARGE_COUNT_AT_ONCE;
+			cond_resched();
+		}
+		err = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
+		if (err)
+			/* mem_cgroup_clear_mc() will do uncharge later */
+			return err;
+		mc.precharge++;
+	}
+	return err;
+}
+
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma to which the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer where the target page or swap entry will be stored (can be NULL)
+ *
+ * Returns
+ * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ * move charge. if @target is not NULL, the page is stored in target->page
+ * with extra refcnt got(Callers should handle it).
+ * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ * target for charge migration. if @target is not NULL, the entry is stored
+ * in target->ent.
+ *
+ * Called with pte lock held.
+ */
+union mc_target { /* out-parameter of get_mctgt_type() and get_mctgt_type_thp() */
+ struct page *page; /* filled in when MC_TARGET_PAGE is returned */
+ swp_entry_t ent; /* filled in when MC_TARGET_SWAP is returned */
+};
+
+enum mc_target_type { /* return value of get_mctgt_type() and get_mctgt_type_thp() */
+ MC_TARGET_NONE = 0, /* must stay 0: callers test the result for truth */
+ MC_TARGET_PAGE,
+ MC_TARGET_SWAP,
+};
+
+/*
+ * Look up the page behind a present pte and decide whether its charge may
+ * be moved.
+ *
+ * Returns the page with an extra reference taken, or NULL if there is no
+ * normal mapped page, the matching move type (anon/file) is not enabled,
+ * or the reference could not be taken.
+ */
+static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
+						unsigned long addr, pte_t ptent)
+{
+	struct page *page = vm_normal_page(vma, addr, ptent);
+	bool wanted;
+
+	if (!page || !page_mapped(page))
+		return NULL;
+
+	/*
+	 * anon pages: we don't move shared anon;
+	 * file pages: we ignore mapcount.
+	 */
+	if (PageAnon(page))
+		wanted = move_anon();
+	else
+		wanted = move_file();
+	if (!wanted)
+		return NULL;
+
+	return get_page_unless_zero(page) ? page : NULL;
+}
+
+#ifdef CONFIG_SWAP
+/*
+ * Handle a swap pte: returns the page found in the swap cache for the
+ * entry (may be NULL) and, when swap accounting is on, stores the swap
+ * entry in @entry. Returns NULL without touching @entry if anon charges
+ * are not moved or the pte holds a non-swap entry.
+ */
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+			unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+	swp_entry_t swp = pte_to_swp_entry(ptent);
+	struct page *cached;
+
+	if (!move_anon() || non_swap_entry(swp))
+		return NULL;
+
+	/*
+	 * Because lookup_swap_cache() updates some statistics counter,
+	 * we call find_get_page() with swapper_space directly.
+	 */
+	cached = find_get_page(swap_address_space(swp), swp.val);
+	if (do_swap_account)
+		entry->val = swp.val;
+
+	return cached;
+}
+#else
+static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{ /* !CONFIG_SWAP stub: never finds a page and leaves @entry untouched */
+ return NULL;
+}
+#endif
+
+/*
+ * Handle a none/file pte of a file-backed vma: look the page up in the
+ * file's mapping. Returns the page found (may be NULL), or NULL right away
+ * for an anonymous vma or when file charges are not moved. For shmem
+ * mappings under CONFIG_SWAP a swapped-out entry is followed into the swap
+ * cache and, when swap accounting is on, reported through @entry.
+ */
+static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
+			unsigned long addr, pte_t ptent, swp_entry_t *entry)
+{
+	struct address_space *mapping;
+	struct page *page = NULL;
+	pgoff_t index;
+
+	/* anonymous vma, or file charges are not to be moved */
+	if (!vma->vm_file || !move_file())
+		return NULL;
+
+	mapping = vma->vm_file->f_mapping;
+	if (pte_none(ptent))
+		index = linear_page_index(vma, addr);
+	else /* pte_file(ptent) is true */
+		index = pte_to_pgoff(ptent);
+
+	/* page is moved even if it's not RSS of this task(page-faulted). */
+#ifdef CONFIG_SWAP
+	if (!shmem_mapping(mapping))
+		return find_get_page(mapping, index);
+
+	/* shmem/tmpfs may report page out on swap: account for that too. */
+	page = find_get_entry(mapping, index);
+	if (radix_tree_exceptional_entry(page)) {
+		swp_entry_t swp = radix_to_swp_entry(page);
+
+		if (do_swap_account)
+			*entry = swp;
+		page = find_get_page(swap_address_space(swp), swp.val);
+	}
+#else
+	page = find_get_page(mapping, index);
+#endif
+	return page;
+}
+
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
+		unsigned long addr, pte_t ptent, union mc_target *target)
+{
+	enum mc_target_type type = MC_TARGET_NONE;
+	swp_entry_t ent = { .val = 0 };
+	struct page *page = NULL;
+
+	/* dispatch on the kind of pte; each helper may return a page */
+	if (pte_present(ptent))
+		page = mc_handle_present_pte(vma, addr, ptent);
+	else if (is_swap_pte(ptent))
+		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
+	else if (pte_none(ptent) || pte_file(ptent))
+		page = mc_handle_file_pte(vma, addr, ptent, &ent);
+
+	if (!page && !ent.val)
+		return MC_TARGET_NONE;
+
+	if (page) {
+		struct page_cgroup *pc = lookup_page_cgroup(page);
+
+		/*
+		 * Do only loose check w/o page_cgroup lock.
+		 * mem_cgroup_move_account() checks the pc is valid or not under
+		 * the lock.
+		 */
+		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+			type = MC_TARGET_PAGE;
+			if (target)
+				target->page = page;
+		}
+		/* keep the reference only when the page goes to the caller */
+		if (type == MC_TARGET_NONE || !target)
+			put_page(page);
+	}
+
+	/* There is a swap entry and a page doesn't exist or isn't charged */
+	if (ent.val && type == MC_TARGET_NONE &&
+	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
+		type = MC_TARGET_SWAP;
+		if (target)
+			target->ent = ent;
+	}
+	return type;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Huge-pmd counterpart of get_mctgt_type().
+ *
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ *
+ * Returns MC_TARGET_PAGE if the huge page is charged to mc.from and anon
+ * charges are moved, MC_TARGET_NONE otherwise. When @target is not NULL
+ * the page is stored there with an extra reference.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+		unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+	struct page *page = pmd_page(pmd);
+	struct page_cgroup *pc;
+
+	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
+	if (!move_anon())
+		return MC_TARGET_NONE;
+
+	pc = lookup_page_cgroup(page);
+	if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc.from)
+		return MC_TARGET_NONE;
+
+	if (target) {
+		get_page(page);
+		target->page = page;
+	}
+	return MC_TARGET_PAGE;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{ /* !CONFIG_TRANSPARENT_HUGEPAGE stub: a huge pmd is never a move target */
+ return MC_TARGET_NONE;
+}
+#endif
+
+/*
+ * pmd_entry callback used by mem_cgroup_count_precharge(): add to
+ * mc.precharge the number of pages under @pmd in [@addr, @end) that are
+ * move targets. The vma is passed through walk->private. Always returns 0.
+ */
+static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+					unsigned long addr, unsigned long end,
+					struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->private;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+	/* a huge pmd counts as HPAGE_PMD_NR pages in one step */
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+			mc.precharge += HPAGE_PMD_NR;
+		spin_unlock(ptl);
+		return 0;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	while (addr != end) {
+		if (get_mctgt_type(vma, addr, *ptep, NULL))
+			mc.precharge++;	/* increment precharge temporarily */
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+	cond_resched();
+
+	return 0;
+}
+
+/*
+ * Walk every non-hugetlb vma of @mm under mmap_sem (read) and count the
+ * pages whose charge would be moved. mc.precharge is used as the scratch
+ * counter during the walk and is reset to 0 before returning the count.
+ */
+static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	unsigned long nr_targets;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		struct mm_walk count_walk = {
+			.pmd_entry = mem_cgroup_count_precharge_pte_range,
+			.mm = mm,
+			.private = vma,
+		};
+
+		if (!is_vm_hugetlb_page(vma))
+			walk_page_range(vma->vm_start, vma->vm_end,
+					&count_walk);
+	}
+	up_read(&mm->mmap_sem);
+
+	nr_targets = mc.precharge;
+	mc.precharge = 0;
+
+	return nr_targets;
+}
+
+/*
+ * Count the move targets of @mm, mark current as the moving task and
+ * precharge that many pages. Returns the result of
+ * mem_cgroup_do_precharge().
+ */
+static int mem_cgroup_precharge_mc(struct mm_struct *mm)
+{
+	unsigned long nr_pages;
+
+	nr_pages = mem_cgroup_count_precharge(mm);
+
+	VM_BUG_ON(mc.moving_task);
+	mc.moving_task = current;
+
+	return mem_cgroup_do_precharge(nr_pages);
+}
+
+/*
+ * cancels all extra charges on mc.from and mc.to, and wakes up all waiters.
+ *
+ * mc.from and mc.to themselves are left set; mem_cgroup_clear_mc() resets
+ * them. Besides that caller, mem_cgroup_move_charge() calls this while it
+ * retries taking mmap_sem.
+ */
+static void __mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from; /* only used for memcg_oom_recover() below */
+ struct mem_cgroup *to = mc.to; /* only used for memcg_oom_recover() below */
+ int i;
+
+ /* we must uncharge all the leftover precharges from mc.to */
+ if (mc.precharge) {
+ __mem_cgroup_cancel_charge(mc.to, mc.precharge);
+ mc.precharge = 0;
+ }
+ /*
+ * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
+ * we must uncharge here.
+ */
+ if (mc.moved_charge) {
+ __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+ mc.moved_charge = 0;
+ }
+ /* we must fixup refcnts and charges */
+ if (mc.moved_swap) {
+ /* uncharge swap account from the old cgroup */
+ if (!mem_cgroup_is_root(mc.from))
+ res_counter_uncharge(&mc.from->memsw,
+ PAGE_SIZE * mc.moved_swap);
+
+ for (i = 0; i < mc.moved_swap; i++) /* one css_put() on mc.from per moved swap entry */
+ css_put(&mc.from->css);
+
+ if (!mem_cgroup_is_root(mc.to)) {
+ /*
+ * we charged both to->res and to->memsw, so we should
+ * uncharge to->res.
+ */
+ res_counter_uncharge(&mc.to->res,
+ PAGE_SIZE * mc.moved_swap);
+ }
+ /* we've already done css_get(mc.to) */
+ mc.moved_swap = 0;
+ }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+/*
+ * End a charge-moving session: clear mc.moving_task, cancel the leftover
+ * charges, reset mc.from/mc.to under mc.lock and end the move on the old
+ * source memcg.
+ */
+static void mem_cgroup_clear_mc(void)
+{
+	/* remember the source: mc.from is cleared before end_move needs it */
+	struct mem_cgroup *old_from = mc.from;
+
+	/*
+	 * we must clear moving_task before waking up waiters at the end of
+	 * task migration.
+	 */
+	mc.moving_task = NULL;
+	__mem_cgroup_clear_mc();
+
+	spin_lock(&mc.lock);
+	mc.from = NULL;
+	mc.to = NULL;
+	spin_unlock(&mc.lock);
+
+	mem_cgroup_end_move(old_from);
+}
+
+/*
+ * can_attach callback: if the destination memcg has
+ * move_charge_at_immigrate set and the first task of @tset owns its mm,
+ * set up the move context (mc) and precharge the destination.
+ *
+ * Returns 0 when nothing has to be moved or the precharge succeeded,
+ * otherwise the precharge error (the move context is torn down again).
+ */
+static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
+				 struct cgroup_taskset *tset)
+{
+	struct task_struct *p = cgroup_taskset_first(tset);
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	unsigned long move_charge_at_immigrate;
+	struct mem_cgroup *from;
+	struct mm_struct *mm;
+	int err = 0;
+
+	/*
+	 * We are now committed to this value whatever it is. Changes in this
+	 * tunable will only affect upcoming migrations, not the current one.
+	 * So we need to save it, and keep it going.
+	 */
+	move_charge_at_immigrate = memcg->move_charge_at_immigrate;
+	if (!move_charge_at_immigrate)
+		return 0;
+
+	from = mem_cgroup_from_task(p);
+	VM_BUG_ON(from == memcg);
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return 0;
+
+	/* We move charges only when we move a owner of the mm */
+	if (mm->owner == p) {
+		VM_BUG_ON(mc.from);
+		VM_BUG_ON(mc.to);
+		VM_BUG_ON(mc.precharge);
+		VM_BUG_ON(mc.moved_charge);
+		VM_BUG_ON(mc.moved_swap);
+
+		mem_cgroup_start_move(from);
+
+		spin_lock(&mc.lock);
+		mc.from = from;
+		mc.to = memcg;
+		mc.immigrate_flags = move_charge_at_immigrate;
+		spin_unlock(&mc.lock);
+		/* We set mc.moving_task later */
+
+		err = mem_cgroup_precharge_mc(mm);
+		if (err)
+			mem_cgroup_clear_mc();
+	}
+	mmput(mm);
+
+	return err;
+}
+
+static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{ /* .cancel_attach callback; @css and @tset are unused */
+ mem_cgroup_clear_mc(); /* undo what mem_cgroup_can_attach() set up */
+}
+
+static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{ /* pmd_entry callback of the page walk in mem_cgroup_move_charge() */
+ int ret = 0; /* stays 0 unless mem_cgroup_do_precharge(1) fails below */
+ struct vm_area_struct *vma = walk->private;
+ pte_t *pte;
+ spinlock_t *ptl;
+ enum mc_target_type target_type;
+ union mc_target target;
+ struct page *page;
+ struct page_cgroup *pc;
+
+ /*
+ * We don't take compound_lock() here but no race with splitting thp
+ * happens because:
+ * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+ * under splitting, which means there's no concurrent thp split,
+ * - if another thread runs into split_huge_page() just after we
+ * entered this if-block, the thread must wait for page table lock
+ * to be unlocked in __split_huge_page_splitting(), where the main
+ * part of thp split is not executed yet.
+ */
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (mc.precharge < HPAGE_PMD_NR) { /* not enough precharge left for a huge page: skip it */
+ spin_unlock(ptl);
+ return 0;
+ }
+ target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+ if (target_type == MC_TARGET_PAGE) {
+ page = target.page;
+ if (!isolate_lru_page(page)) {
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ pc, mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ putback_lru_page(page);
+ }
+ put_page(page); /* drops the reference taken by get_mctgt_type_thp() */
+ }
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+retry:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (; addr != end; addr += PAGE_SIZE) {
+ pte_t ptent = *(pte++);
+ swp_entry_t ent;
+
+ if (!mc.precharge) /* out of precharge: leave the loop and try to get one more below */
+ break;
+
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_PAGE:
+ page = target.page;
+ if (isolate_lru_page(page))
+ goto put;
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(page, 1, pc,
+ mc.from, mc.to)) {
+ mc.precharge--;
+ /* we uncharge from mc.from later. */
+ mc.moved_charge++;
+ }
+ putback_lru_page(page);
+put: /* get_mctgt_type() gets the page */
+ put_page(page);
+ break;
+ case MC_TARGET_SWAP:
+ ent = target.ent;
+ if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
+ mc.precharge--;
+ /* we fixup refcnts and charges later. */
+ mc.moved_swap++;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+
+ if (addr != end) {
+ /*
+ * We have consumed all precharges we got in can_attach().
+ * We try charge one by one, but don't do any additional
+ * charges to mc.to if we have failed in charge once in attach()
+ * phase.
+ */
+ ret = mem_cgroup_do_precharge(1);
+ if (!ret)
+ goto retry; /* resumes at the current addr, which was not advanced past the unprocessed pte */
+ }
+
+ return ret;
+}
+
+/*
+ * Move the charges of @mm from mc.from to mc.to by walking every
+ * non-hugetlb vma with mem_cgroup_move_charge_pte_range(). The walk stops
+ * at the first vma for which the page walk returns an error.
+ */
+static void mem_cgroup_move_charge(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	lru_add_drain_all();
+
+	/*
+	 * Someone who is holding the mmap_sem might be waiting in
+	 * waitq. So we cancel all extra charges, wake up all waiters,
+	 * and retry. Because we cancel precharges, we might not be able
+	 * to move enough charges, but moving charge is a best-effort
+	 * feature anyway, so it wouldn't be a big problem.
+	 */
+	while (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		__mem_cgroup_clear_mc();
+		cond_resched();
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		struct mm_walk move_walk = {
+			.pmd_entry = mem_cgroup_move_charge_pte_range,
+			.mm = mm,
+			.private = vma,
+		};
+
+		if (is_vm_hugetlb_page(vma))
+			continue;
+		/*
+		 * A non-zero result means we have consumed all precharges
+		 * and failed in doing additional charge. Just abandon here.
+		 */
+		if (walk_page_range(vma->vm_start, vma->vm_end, &move_walk))
+			break;
+	}
+	up_read(&mm->mmap_sem);
+}
+
+/*
+ * attach callback: if a move context was set up (mc.to is set), move the
+ * charges of the first task's mm and then tear the context down. The
+ * context is torn down even when the task has no mm.
+ */
+static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
+				 struct cgroup_taskset *tset)
+{
+	struct task_struct *leader = cgroup_taskset_first(tset);
+	struct mm_struct *task_mm = get_task_mm(leader);
+
+	if (task_mm) {
+		if (mc.to)
+			mem_cgroup_move_charge(task_mm);
+		mmput(task_mm);
+	}
+
+	if (mc.to)
+		mem_cgroup_clear_mc();
+}
+#else /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{ /* !CONFIG_MMU stub: nothing to precharge, always succeeds */
+ return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{ /* !CONFIG_MMU stub: no move context to undo */
+}
+static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{ /* !CONFIG_MMU stub: no charges are moved */
+}
+#endif
+
+/*
+ * Cgroup retains root cgroups across [un]mount cycles making it necessary
+ * to verify the sane_behavior flag on each mount attempt.
+ */
+static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
+{
+	struct mem_cgroup *root_memcg;
+
+	if (!cgroup_sane_behavior(root_css->cgroup))
+		return;
+
+	/*
+	 * use_hierarchy is forced with sane_behavior. cgroup core
+	 * guarantees that @root doesn't have any children, so turning it
+	 * on for the root memcg is enough.
+	 */
+	root_memcg = mem_cgroup_from_css(root_css);
+	root_memcg->use_hierarchy = true;
+}
+
+struct cgroup_subsys memory_cgrp_subsys = { /* callback table of the memory controller */
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
+ .can_attach = mem_cgroup_can_attach, /* sets up the move context and precharges */
+ .cancel_attach = mem_cgroup_cancel_attach, /* tears the move context down */
+ .attach = mem_cgroup_move_task, /* moves the charges, then tears the move context down */
+ .bind = mem_cgroup_bind,
+ .base_cftypes = mem_cgroup_files, /* the memsw.* files are added separately by memsw_file_init() */
+ .early_init = 0,
+};
+
+#ifdef CONFIG_MEMCG_SWAP
+/*
+ * Parse the "swapaccount=" boot parameter: "1" turns swap accounting on,
+ * "0" turns it off, any other value leaves the current setting alone.
+ * Always returns 1.
+ */
+static int __init enable_swap_account(char *s)
+{
+	if (strcmp(s, "1") == 0)
+		really_do_swap_account = 1;
+	else if (strcmp(s, "0") == 0)
+		really_do_swap_account = 0;
+
+	return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+/* Register the memsw.* control files; a failure only triggers a warning. */
+static void __init memsw_file_init(void)
+{
+	int err = cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files);
+
+	WARN_ON(err);
+}
+
+/*
+ * Switch swap accounting on and register its control files, unless the
+ * memory controller is disabled or swap accounting was not requested.
+ */
+static void __init enable_swap_cgroup(void)
+{
+	if (mem_cgroup_disabled() || !really_do_swap_account)
+		return;
+
+	do_swap_account = 1;
+	memsw_file_init();
+}
+
+#else
+static void __init enable_swap_cgroup(void)
+{ /* !CONFIG_MEMCG_SWAP stub: nothing to enable */
+}
+#endif
+
+/*
+ * subsys_initcall() for memory controller.
+ *
+ * Some parts like hotcpu_notifier() have to be initialized from this context
+ * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
+ * everything that doesn't depend on a specific mem_cgroup structure should
+ * be initialized from here.
+ *
+ * Registers the cpu hotplug callback, enables swap accounting if requested,
+ * sets up the soft limit trees and the stock. Always returns 0.
+ */
+static int __init mem_cgroup_init(void)
+{
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+ enable_swap_cgroup(); /* empty stub without CONFIG_MEMCG_SWAP */
+ mem_cgroup_soft_limit_tree_init(); /* BUG()s if an allocation fails */
+ memcg_stock_init();
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 00000000000..a013bc94ebb
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,1738 @@
+/*
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Authors: Andi Kleen, Fengguang Wu
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 only as published by the
+ * Free Software Foundation.
+ *
+ * High level machine check handler. Handles pages reported by the
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
+ * failure.
+ *
+ * In addition there is a "soft offline" entry point that allows us to stop
+ * using not-yet-corrupted-but-suspicious pages without killing anything.
+ *
+ * Handles page cache pages in various states. The tricky part
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non-linear complexity in the number of processes. But since memory
+ * corruptions are rare we hope to get away with this. This avoids impacting
+ * the core VM.
+ */
+
+/*
+ * Notebook:
+ * - hugetlb needs more code
+ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
+ * - pass bad pages to kdump next kernel
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/kernel-page-flags.h>
+#include <linux/sched.h>
+#include <linux/ksm.h>
+#include <linux/rmap.h>
+#include <linux/export.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/backing-dev.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/suspend.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
+#include <linux/kfifo.h>
+#include "internal.h"
+
+int sysctl_memory_failure_early_kill __read_mostly = 0; /* consulted by task_early_kill() */
+
+int sysctl_memory_failure_recovery __read_mostly = 1; /* 0: memory_failure() panics */
+
+atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); /* adjusted by memory_failure() */
+
+#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+
+u32 hwpoison_filter_enable = 0; /* 0: hwpoison_filter() lets every page through */
+u32 hwpoison_filter_dev_major = ~0U; /* ~0U: do not filter on the major number */
+u32 hwpoison_filter_dev_minor = ~0U; /* ~0U: do not filter on the minor number */
+u64 hwpoison_filter_flags_mask; /* 0: do not filter on page flags */
+u64 hwpoison_filter_flags_value; /* value the masked page flags must equal */
+EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
+
+/*
+ * Device filter: returns 0 when the page passes, -EINVAL when it is
+ * filtered out. A value of ~0U in the major or minor knob matches any
+ * device; with both left at ~0U every page passes.
+ */
+static int hwpoison_filter_dev(struct page *p)
+{
+	const u32 any = ~0U;
+	struct address_space *mapping;
+	dev_t dev;
+
+	if (hwpoison_filter_dev_major == any &&
+	    hwpoison_filter_dev_minor == any)
+		return 0;
+
+	/* page_mapping() does not accept slab pages. */
+	if (PageSlab(p))
+		return -EINVAL;
+
+	mapping = page_mapping(p);
+	if (!mapping || !mapping->host)
+		return -EINVAL;
+
+	dev = mapping->host->i_sb->s_dev;
+
+	if (hwpoison_filter_dev_major != any &&
+	    hwpoison_filter_dev_major != MAJOR(dev))
+		return -EINVAL;
+
+	if (hwpoison_filter_dev_minor != any &&
+	    hwpoison_filter_dev_minor != MINOR(dev))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Page flags filter: with an empty mask every page passes (0); otherwise
+ * the masked stable page flags must equal hwpoison_filter_flags_value,
+ * else the page is filtered out (-EINVAL).
+ */
+static int hwpoison_filter_flags(struct page *p)
+{
+	u64 masked;
+
+	if (!hwpoison_filter_flags_mask)
+		return 0;
+
+	masked = stable_page_flags(p) & hwpoison_filter_flags_mask;
+
+	return masked == hwpoison_filter_flags_value ? 0 : -EINVAL;
+}
+
+/*
+ * This allows stress tests to limit test scope to a collection of tasks
+ * by putting them under some memcg. This prevents killing unrelated/important
+ * processes such as /sbin/init. Note that the target task may share clean
+ * pages with init (eg. libc text), which is harmless. If the target task
+ * shares _dirty_ pages with another task B, the test scheme must make sure B
+ * is also included in the memcg. Finally, due to race conditions this filter
+ * can only guarantee that the page either belongs to the memcg tasks, or is
+ * a freed page.
+ */
+#ifdef CONFIG_MEMCG_SWAP
+u64 hwpoison_filter_memcg;
+EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
+
+/*
+ * memcg filter: returns 0 when no memcg is configured or when the inode
+ * number of the cgroup the page is charged to equals
+ * hwpoison_filter_memcg, -EINVAL otherwise.
+ */
+static int hwpoison_filter_task(struct page *p)
+{
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+	unsigned long cgrp_ino;
+
+	if (!hwpoison_filter_memcg)
+		return 0;
+
+	memcg = try_get_mem_cgroup_from_page(p);
+	if (!memcg)
+		return -EINVAL;
+
+	/* Read the inode number, then drop the css again. */
+	css = mem_cgroup_css(memcg);
+	cgrp_ino = cgroup_ino(css->cgroup);
+	css_put(css);
+
+	if (cgrp_ino && cgrp_ino == hwpoison_filter_memcg)
+		return 0;
+
+	return -EINVAL;
+}
+#else
+/* Without CONFIG_MEMCG_SWAP there is no memcg filter: every page passes. */
+static int hwpoison_filter_task(struct page *p)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Decide whether the injector filters may touch @p. Returns 0 when
+ * filtering is disabled or the page passes the device, flags and memcg
+ * filters (checked in that order), -EINVAL as soon as one of them
+ * rejects the page.
+ */
+int hwpoison_filter(struct page *p)
+{
+	if (!hwpoison_filter_enable)
+		return 0;
+
+	if (hwpoison_filter_dev(p) ||
+	    hwpoison_filter_flags(p) ||
+	    hwpoison_filter_task(p))
+		return -EINVAL;
+
+	return 0;
+}
+#else
+int hwpoison_filter(struct page *p)
+{
+ return 0; /* built without the hwpoison injector: no page is filtered out */
+}
+#endif
+
+EXPORT_SYMBOL_GPL(hwpoison_filter);
+
+/*
+ * Send a SIGBUS describing the memory error to one task that has the
+ * page mapped.
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
+ *
+ * @t:      task to signal
+ * @addr:   user address reported in si_addr
+ * @trapno: trap number, reported only where __ARCH_SI_TRAPNO is defined
+ * @pfn:    page frame number, used for logging only
+ * @page:   the affected page; its compound order determines si_addr_lsb
+ * @flags:  MF_* flags; MF_ACTION_REQUIRED selects BUS_MCEERR_AR when @t
+ *          shares its mm with current
+ *
+ * Returns the result of the signal delivery call (negative on failure).
+ */
+static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
+			unsigned long pfn, struct page *page, int flags)
+{
+	struct siginfo si;
+	int ret;
+
+	printk(KERN_ERR
+		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
+		pfn, t->comm, t->pid);
+
+	/*
+	 * Start from an all-zero siginfo so that members and padding which
+	 * are not set explicitly below never carry stale kernel stack
+	 * contents into the signal queue.
+	 */
+	memset(&si, 0, sizeof(si));
+	si.si_signo = SIGBUS;
+	si.si_errno = 0;
+	si.si_addr = (void *)addr;
+#ifdef __ARCH_SI_TRAPNO
+	si.si_trapno = trapno;
+#endif
+	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+
+	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
+		si.si_code = BUS_MCEERR_AR;
+		ret = force_sig_info(SIGBUS, &si, current);
+	} else {
+		/*
+		 * Don't use force here, it's convenient if the signal
+		 * can be temporarily blocked.
+		 * This could cause a loop when the user sets SIGBUS
+		 * to SIG_IGN, but hopefully no one will do that?
+		 */
+		si.si_code = BUS_MCEERR_AO;
+		ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
+	}
+	if (ret < 0)
+		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
+		       t->comm, t->pid, ret);
+	return ret;
+}
+
+/*
+ * When an unknown page type is encountered drain as many buffers as
+ * possible in the hope of turning the page into an LRU or free page,
+ * which we can handle.
+ *
+ * @p:      page to shake loose
+ * @access: when non-zero, slab caches of the page's node are shrunk too
+ */
+void shake_page(struct page *p, int access)
+{
+	int node;
+
+	if (!PageSlab(p)) {
+		lru_add_drain_all();
+		if (PageLRU(p))
+			return;
+		drain_all_pages();
+		if (PageLRU(p) || is_free_buddy_page(p))
+			return;
+	}
+
+	/*
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
+	 */
+	if (!access)
+		return;
+
+	node = page_to_nid(p);
+	for (;;) {
+		struct shrink_control shrink = {
+			.gfp_mask = GFP_KERNEL,
+		};
+		int nr_shrunk;
+
+		node_set(node, shrink.nodes_to_scan);
+		nr_shrunk = shrink_slab(&shrink, 1000, 1000);
+
+		/* Stop once we hold the only reference or progress stalls. */
+		if (page_count(p) == 1 || nr_shrunk <= 10)
+			break;
+	}
+}
+EXPORT_SYMBOL_GPL(shake_page);
+
+/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handling it.
+ */
+
+struct to_kill {
+ struct list_head nd; /* links the entry into the caller's to-kill list */
+ struct task_struct *tsk; /* task to signal; add_to_kill() takes a reference */
+ unsigned long addr; /* result of page_address_in_vma() for this task */
+ char addr_valid; /* 0 when the lookup of addr returned -EFAULT */
+};
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do. We just print a message and ignore otherwise.
+ */
+
+/*
+ * Schedule a process for later kill.
+ *
+ * The preallocated entry in *@tkc is consumed first; once it is used up
+ * further entries come from GFP_ATOMIC allocations to avoid potential
+ * recursions in the VM. If that allocation fails the task is logged as
+ * lost and not queued.
+ * TBD would GFP_NOIO be enough?
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+		       struct vm_area_struct *vma,
+		       struct list_head *to_kill,
+		       struct to_kill **tkc)
+{
+	struct to_kill *entry = *tkc;
+
+	if (entry) {
+		*tkc = NULL;
+	} else {
+		entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+		if (!entry) {
+			printk(KERN_ERR
+		"MCE: Out of memory while machine check handling\n");
+			return;
+		}
+	}
+
+	entry->addr = page_address_in_vma(p, vma);
+	entry->addr_valid = (entry->addr != -EFAULT);
+
+	/*
+	 * In theory we don't have to kill when the page was
+	 * munmaped. But it could be also a mremap. Since that's
+	 * likely very rare kill anyways just out of paranoia, but use
+	 * a SIGKILL because the error is not contained anymore.
+	 */
+	if (!entry->addr_valid)
+		pr_info("MCE: Unable to find user space address %lx in %s\n",
+			page_to_pfn(p), tsk->comm);
+
+	get_task_struct(tsk);
+	entry->tsk = tsk;
+	list_add_tail(&entry->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Signals are only sent when @forcekill is set, otherwise the list is
+ * just freed (this is used for clean pages which do not need killing).
+ * When @fail is set, or no valid user address was recorded for a task,
+ * the task gets an uncatchable SIGKILL because something went wrong
+ * earlier. Every entry's task reference is dropped and the entry freed.
+ */
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
+			  int fail, struct page *page, unsigned long pfn,
+			  int flags)
+{
+	struct to_kill *entry, *tmp;
+
+	list_for_each_entry_safe(entry, tmp, to_kill, nd) {
+		struct task_struct *victim = entry->tsk;
+
+		if (forcekill) {
+			if (fail || !entry->addr_valid) {
+				/*
+				 * In case something went wrong with
+				 * munmapping make sure the process doesn't
+				 * catch the signal and then access the
+				 * memory. Just kill it.
+				 */
+				printk(KERN_ERR
+		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+					pfn, victim->comm, victim->pid);
+				force_sig(SIGKILL, victim);
+			} else if (kill_proc(victim, entry->addr, trapno,
+					     pfn, page, flags) < 0) {
+				/*
+				 * In theory the process could have mapped
+				 * something else on the address in-between.
+				 * We could check for that, but we need to
+				 * tell the process anyways.
+				 */
+				printk(KERN_ERR
+		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+					pfn, victim->comm, victim->pid);
+			}
+		}
+
+		put_task_struct(victim);
+		kfree(entry);
+	}
+}
+
+/*
+ * Look for the thread that handles SIGBUS(BUS_MCEERR_AO) on behalf of
+ * its thread group, i.e. one with both PF_MCE_PROCESS and PF_MCE_EARLY
+ * set. Returns the first such thread, or NULL when there is none.
+ *
+ * The caller already holds read_lock(&tasklist_lock), so there is no
+ * need for rcu_read_lock/unlock() in here.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+{
+	struct task_struct *thread;
+
+	for_each_thread(tsk, thread) {
+		if (!(thread->flags & PF_MCE_PROCESS))
+			continue;
+		if (thread->flags & PF_MCE_EARLY)
+			return thread;
+	}
+
+	return NULL;
+}
+
+/*
+ * Decide whether @tsk is an "early kill" process, i.e. one that expects
+ * to be signalled as soon as one of its pages is hwpoisoned.
+ *
+ * Returns the task to signal: @tsk itself when @force_early is set or
+ * the global early-kill sysctl is on, or the dedicated thread found by
+ * find_early_kill_thread(). Returns NULL for tasks without an mm and
+ * for processes that are not early kill.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+					   int force_early)
+{
+	struct task_struct *handler;
+
+	if (!tsk->mm)
+		return NULL;
+	if (force_early)
+		return tsk;
+
+	handler = find_early_kill_thread(tsk);
+	if (handler)
+		return handler;
+
+	return sysctl_memory_failure_early_kill ? tsk : NULL;
+}
+
+/*
+ * Collect processes when the error hit an anonymous page: every
+ * early-kill task whose mm owns a vma in which the page is mapped is
+ * queued on @to_kill.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc, int force_early)
+{
+	struct anon_vma_chain *avc;
+	struct task_struct *proc;
+	struct anon_vma *anon;
+	pgoff_t index;
+
+	anon = page_lock_anon_vma_read(page);
+	if (!anon)	/* Not actually mapped anymore */
+		return;
+
+	index = page_to_pgoff(page);
+	read_lock(&tasklist_lock);
+	for_each_process(proc) {
+		struct task_struct *target = task_early_kill(proc, force_early);
+
+		if (!target)
+			continue;
+
+		anon_vma_interval_tree_foreach(avc, &anon->rb_root,
+					       index, index) {
+			struct vm_area_struct *vma = avc->vma;
+
+			if (page_mapped_in_vma(page, vma) &&
+			    vma->vm_mm == target->mm)
+				add_to_kill(target, page, vma, to_kill, tkc);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	page_unlock_anon_vma_read(anon);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page: every
+ * early-kill task whose mm owns a vma covering the page's offset in the
+ * mapping is queued on @to_kill.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+			      struct to_kill **tkc, int force_early)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff;
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	/*
+	 * The offset does not depend on the task being examined, so look
+	 * it up once instead of once per process (as collect_procs_anon()
+	 * does).
+	 */
+	pgoff = page_to_pgoff(page);
+	read_lock(&tasklist_lock);
+	for_each_process(tsk) {
+		struct task_struct *t = task_early_kill(tsk, force_early);
+
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
+				      pgoff) {
+			/*
+			 * Send early kill signal to tasks where a vma covers
+			 * the page but the corrupted page is not necessarily
+			 * mapped in its pte.
+			 * Assume applications who requested early kill want
+			 * to be informed of all such data corruptions.
+			 */
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, vma, to_kill, tkc);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
+}
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ * This is done in two steps for locking reasons.
+ * First preallocate one to_kill structure outside the spin locks,
+ * so that we can kill at least one process reasonably reliably.
+ * Pages without a mapping are skipped, as is the whole collection when
+ * the preallocation fails.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill,
+				int force_early)
+{
+	struct to_kill *spare;
+
+	if (!page->mapping)
+		return;
+
+	spare = kmalloc(sizeof(*spare), GFP_NOIO);
+	if (!spare)
+		return;
+
+	if (PageAnon(page))
+		collect_procs_anon(page, tokill, &spare, force_early);
+	else
+		collect_procs_file(page, tokill, &spare, force_early);
+
+	/* NULL if the collector consumed the spare entry. */
+	kfree(spare);
+}
+
+/*
+ * Error handlers for various types of pages.
+ */
+
+enum outcome {
+ IGNORED, /* Error: cannot be handled */
+ FAILED, /* Error: handling failed */
+ DELAYED, /* Will be handled later */
+ RECOVERED, /* Successfully recovered */
+};
+
+static const char *action_name[] = { /* indexed by enum outcome; printed by action_result() */
+ [IGNORED] = "Ignored",
+ [FAILED] = "Failed",
+ [DELAYED] = "Delayed",
+ [RECOVERED] = "Recovered",
+};
+
+/*
+ * Take the page off the LRU. Returns 0 on success and -EIO when the
+ * page could not be isolated.
+ *
+ * XXX: It is possible that a page is isolated from LRU cache,
+ * and then kept in swap cache or failed to remove from page cache.
+ * The page count will stop it from being freed by unpoison.
+ * Stress tests should be aware of this memory leak problem.
+ */
+static int delete_from_lru_cache(struct page *p)
+{
+	if (isolate_lru_page(p))
+		return -EIO;
+
+	/*
+	 * Clear sensible page flags, so that the buddy system won't
+	 * complain when the page is unpoison-and-freed.
+	 */
+	ClearPageActive(p);
+	ClearPageUnevictable(p);
+
+	/* drop the page count elevated by isolate_lru_page() */
+	page_cache_release(p);
+
+	return 0;
+}
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ * Neither argument is used; always returns IGNORED.
+ */
+static int me_kernel(struct page *p, unsigned long pfn)
+{
+ return IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing apart from logging the pfn.
+ * Always returns FAILED.
+ */
+static int me_unknown(struct page *p, unsigned long pfn)
+{
+ printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ return FAILED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ *
+ * Takes the page off the LRU and then tries to drop it from the page
+ * cache. Returns RECOVERED on success (and for anonymous pages), FAILED
+ * when the mapping is gone or the page could not be removed.
+ */
+static int me_pagecache_clean(struct page *p, unsigned long pfn)
+{
+	struct address_space *mapping;
+	int err;
+
+	delete_from_lru_cache(p);
+
+	/*
+	 * For anonymous pages we're done: the only reference left
+	 * should be the one m_f() holds.
+	 */
+	if (PageAnon(p))
+		return RECOVERED;
+
+	/*
+	 * Now truncate the page in the page cache. This is really
+	 * more like a "temporary hole punch"
+	 * Don't do this for block devices when someone else
+	 * has a reference, because it could be file system metadata
+	 * and that's not safe to truncate.
+	 */
+	mapping = page_mapping(p);
+	if (!mapping) {
+		/* Page has been torn down in the meanwhile */
+		return FAILED;
+	}
+
+	if (!mapping->a_ops->error_remove_page) {
+		/*
+		 * If the file system doesn't support it just invalidate
+		 * This fails on dirty or anything with private pages
+		 */
+		if (invalidate_inode_page(p))
+			return RECOVERED;
+
+		printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
+			pfn);
+		return FAILED;
+	}
+
+	/*
+	 * Truncation is a bit tricky. Enable it per file system for now.
+	 *
+	 * Open: to take i_mutex or not for this? Right now we don't.
+	 */
+	err = mapping->a_ops->error_remove_page(mapping, p);
+	if (err) {
+		printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
+			pfn, err);
+		return FAILED;
+	}
+
+	if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) {
+		pr_info("MCE %#lx: failed to release buffers\n", pfn);
+		return FAILED;
+	}
+
+	return RECOVERED;
+}
+
+/*
+ * Dirty pagecache page
+ * Sets the error flag on the page and, when it has one, records EIO on
+ * its mapping, then handles the page like a clean page cache page.
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ */
+static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+{
+ struct address_space *mapping = page_mapping(p);
+
+ SetPageError(p);
+ /* TBD: print more information about the file. */
+ if (mapping) {
+ /*
+ * IO error will be reported by write(), fsync(), etc.
+ * who check the mapping.
+ * This way the application knows that something went
+ * wrong with its dirty file data.
+ *
+ * There's one open issue:
+ *
+ * The EIO will be only reported on the next IO
+ * operation and then cleared through the IO map.
+ * Normally Linux has two mechanisms to pass IO error
+ * first through the AS_EIO flag in the address space
+ * and then through the PageError flag in the page.
+ * Since we drop pages on memory failure handling the
+ * only mechanism open to use is through AS_EIO.
+ *
+ * This has the disadvantage that it gets cleared on
+ * the first operation that returns an error, while
+ * the PageError bit is more sticky and only cleared
+ * when the page is reread or dropped. If an
+ * application assumes it will always get error on
+ * fsync, but does other operations on the fd before
+ * and the page is dropped between then the error
+ * will not be properly reported.
+ *
+ * This can already happen even without hwpoisoned
+ * pages: first on metadata IO errors (which only
+ * report through AS_EIO) or when the page is dropped
+ * at the wrong time.
+ *
+ * So right now we assume that the application DTRT on
+ * the first EIO, but we're not worse than other parts
+ * of the kernel.
+ */
+ mapping_set_error(mapping, EIO);
+ }
+
+ return me_pagecache_clean(p, pfn);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ * - clear dirty bit to prevent IO
+ * - remove from LRU
+ * - but keep in the swap cache, so that when we return to it on
+ * a later page fault, we know the application is accessing
+ * corrupted data and shall be killed (we installed simple
+ * interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated. A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+{
+	int isolate_err;
+
+	ClearPageDirty(p);
+	/* Trigger EIO in shmem: */
+	ClearPageUptodate(p);
+
+	/* The page stays in the swap cache; only take it off the LRU. */
+	isolate_err = delete_from_lru_cache(p);
+
+	return isolate_err ? FAILED : DELAYED;
+}
+
+/*
+ * Clean swap cache page: drop it from the swap cache and the LRU.
+ * Returns RECOVERED when the LRU isolation worked, FAILED otherwise.
+ */
+static int me_swapcache_clean(struct page *p, unsigned long pfn)
+{
+	int isolate_err;
+
+	delete_from_swap_cache(p);
+	isolate_err = delete_from_lru_cache(p);
+
+	return isolate_err ? FAILED : RECOVERED;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
+ */
+static int me_huge_page(struct page *p, unsigned long pfn)
+{
+	struct page *hpage = compound_head(p);
+
+	/*
+	 * We can safely recover from error on free or reserved (i.e.
+	 * not in-use) hugepage by dequeuing it from freelist.
+	 * To check whether a hugepage is in-use or not, we can't use
+	 * page->lru because it can be used in other hugepage operations,
+	 * such as __unmap_hugepage_range() and gather_surplus_pages().
+	 * So instead we use page_mapping() and PageAnon().
+	 * We assume that this function is called with page lock held,
+	 * so there is no race between isolation and mapping/unmapping.
+	 */
+	if (page_mapping(hpage) || PageAnon(hpage))
+		return DELAYED;
+
+	/* Not in use: recovered only if the dequeue succeeds. */
+	return dequeue_hwpoisoned_huge_page(hpage) ? DELAYED : RECOVERED;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access a page at any time
+ * in its life cycle, so all accesses have to be extremely careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+
+#define dirty (1UL << PG_dirty)
+#define sc (1UL << PG_swapcache)
+#define unevict (1UL << PG_unevictable)
+#define mlock (1UL << PG_mlocked)
+#define writeback (1UL << PG_writeback) /* not referenced by the table below */
+#define lru (1UL << PG_lru)
+#define swapbacked (1UL << PG_swapbacked) /* not referenced by the table below */
+#define head (1UL << PG_head)
+#define tail (1UL << PG_tail)
+#define compound (1UL << PG_compound)
+#define slab (1UL << PG_slab)
+#define reserved (1UL << PG_reserved)
+
+static struct page_state {
+ unsigned long mask; /* page flag bits this entry looks at */
+ unsigned long res; /* presumably the value (flags & mask) must have to match - confirm in the lookup code */
+ char *msg; /* state name passed to action_result() */
+ int (*action)(struct page *p, unsigned long pfn); /* handler invoked by page_action() */
+} error_states[] = {
+ { reserved, reserved, "reserved kernel", me_kernel },
+ /*
+ * free pages are specially detected outside this table:
+ * PG_buddy pages only make a small fraction of all free pages.
+ */
+
+ /*
+ * Could in theory check if slab page is free or if we can drop
+ * currently unused objects without touching them. But just
+ * treat it as standard kernel for now.
+ */
+ { slab, slab, "kernel slab", me_kernel },
+
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+ { head, head, "huge", me_huge_page },
+ { tail, tail, "huge", me_huge_page },
+#else
+ { compound, compound, "huge", me_huge_page },
+#endif
+
+ { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
+
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
+
+ { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+ { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+
+ { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
+ { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+
+ /*
+ * Catchall entry: must be at end.
+ * A zero mask is intended to match whatever the entries above did not.
+ */
+ { 0, 0, "unknown page state", me_unknown },
+};
+
+#undef dirty
+#undef sc
+#undef unevict
+#undef mlock
+#undef writeback
+#undef lru
+#undef swapbacked
+#undef head
+#undef tail
+#undef compound
+#undef slab
+#undef reserved
+
+/*
+ * Log the outcome of a recovery attempt: @pfn, the page state name @msg and
+ * the action_name[] string for @result (an enum outcome value).
+ *
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
+static void action_result(unsigned long pfn, char *msg, int result)
+{
+ pr_err("MCE %#lx: %s page recovery: %s\n",
+ pfn, msg, action_name[result]);
+}
+
+/*
+ * Run the handler of page state @ps on @p, log its outcome and verify
+ * that nobody else still references the page afterwards.
+ *
+ * Returns 0 when the page was recovered or its handling delayed,
+ * -EBUSY otherwise (including when unexpected references remain).
+ */
+static int page_action(struct page_state *ps, struct page *p,
+			unsigned long pfn)
+{
+	int extra_refs;
+	int result;
+
+	result = ps->action(p, pfn);
+	action_result(pfn, ps->msg, result);
+
+	/* One reference is expected to remain ... */
+	extra_refs = page_count(p) - 1;
+	/* ... and one more for a dirty swap cache page that was delayed. */
+	if (ps->action == me_swapcache_dirty && result == DELAYED)
+		extra_refs--;
+
+	if (extra_refs != 0) {
+		printk(KERN_ERR
+		       "MCE %#lx: %s page still referenced by %d users\n",
+		       pfn, ps->msg, extra_refs);
+		result = FAILED;
+	}
+
+	/* Could do more checks here if page looks ok */
+	/*
+	 * Could adjust zone counters here to correct for the missing page.
+	 */
+
+	if (result == RECOVERED || result == DELAYED)
+		return 0;
+
+	return -EBUSY;
+}
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ *
+ * Returns SWAP_SUCCESS when the page needs no unmapping, SWAP_FAIL for
+ * KSM pages or when a THP cannot be split, and otherwise the result of
+ * try_to_unmap(). When a THP is split and @p is not its head, the page
+ * lock (and, unless MF_COUNT_INCREASED is set, the reference) is moved
+ * from the head to @p and *@hpagep is updated to @p.
+ */
+static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+ int trapno, int flags, struct page **hpagep)
+{
+ enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ struct address_space *mapping;
+ LIST_HEAD(tokill);
+ int ret;
+ int kill = 1, forcekill;
+ struct page *hpage = *hpagep;
+ struct page *ppage;
+
+ /*
+ * Here we are interested only in user-mapped pages, so skip any
+ * other types of pages.
+ */
+ if (PageReserved(p) || PageSlab(p))
+ return SWAP_SUCCESS;
+ if (!(PageLRU(hpage) || PageHuge(p)))
+ return SWAP_SUCCESS;
+
+ /*
+ * This check implies we don't kill processes if their pages
+ * are in the swap cache early. Those are always late kills.
+ */
+ if (!page_mapped(hpage))
+ return SWAP_SUCCESS;
+
+ if (PageKsm(p)) {
+ pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
+ return SWAP_FAIL;
+ }
+
+ if (PageSwapCache(p)) {
+ printk(KERN_ERR
+ "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu |= TTU_IGNORE_HWPOISON;
+ }
+
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = page_mapping(hpage);
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
+ } else {
+ kill = 0;
+ ttu |= TTU_IGNORE_HWPOISON;
+ printk(KERN_INFO
+ "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ /*
+ * ppage: poisoned page
+ * if p is regular page(4k page)
+ * ppage == real poisoned page;
+ * else p is hugetlb or THP, ppage == head page.
+ */
+ ppage = hpage;
+
+ if (PageTransHuge(hpage)) {
+ /*
+ * Verify that this isn't a hugetlbfs head page, the check for
+ * PageAnon is just to avoid tripping a split_huge_page
+ * internal debug check, as split_huge_page refuses to deal with
+ * anything that isn't an anon page. PageAnon can't go away from
+ * under us because we hold a refcount on the hpage; without a
+ * refcount on the hpage, split_huge_page can't be safely called
+ * in the first place, and having a refcount on the tail isn't
+ * enough to be safe.
+ */
+ if (!PageHuge(hpage) && PageAnon(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ /*
+ * FIXME: if splitting the THP fails, it is
+ * better to stop the following operation rather
+ * than causing panic by unmapping. System might
+ * survive if the page is freed later.
+ */
+ printk(KERN_INFO
+ "MCE %#lx: failed to split THP\n", pfn);
+
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
+ }
+
+ /*
+ * First collect all the processes that have the page
+ * mapped in dirty form. This has to be done before try_to_unmap,
+ * because ttu takes the rmap data structures down.
+ *
+ * Error handling: We ignore errors here because
+ * there's nothing that can be done.
+ */
+ if (kill)
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+
+ ret = try_to_unmap(ppage, ttu);
+ if (ret != SWAP_SUCCESS)
+ printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(ppage));
+
+ /*
+ * Now that the dirty bit has been propagated to the
+ * struct page and all unmaps done we can decide if
+ * killing is needed or not. Only kill when the page
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
+ * freed. When there was a problem unmapping earlier
+ * use a more forceful uncatchable kill to prevent
+ * any accesses to the poisoned memory.
+ */
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
+ ret != SWAP_SUCCESS, p, pfn, flags);
+
+ return ret;
+}
+
+/* Set the HWPoison flag on every page of the compound page headed by @hpage. */
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+	const int nr_pages = 1 << compound_order(hpage);
+	int idx = 0;
+
+	while (idx < nr_pages) {
+		SetPageHWPoison(&hpage[idx]);
+		idx++;
+	}
+}
+
+/* Clear the HWPoison flag on every page of the compound page headed by @hpage. */
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+	const int nr_pages = 1 << compound_order(hpage);
+	int idx = 0;
+
+	while (idx < nr_pages) {
+		ClearPageHWPoison(&hpage[idx]);
+		idx++;
+	}
+}
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks held.
+ */
+int memory_failure(unsigned long pfn, int trapno, int flags)
+{
+ struct page_state *ps;
+ struct page *p; /* the raw page that pfn refers to */
+ struct page *hpage; /* compound_head(p); this is the page that gets locked and refcounted */
+ int res;
+ unsigned int nr_pages; /* amount this error adds to num_poisoned_pages */
+ unsigned long page_flags; /* snapshot of p->flags taken under the page lock */
+
+ if (!sysctl_memory_failure_recovery) /* recovery disabled: any memory failure is fatal */
+ panic("Memory failure from trap %d on page %lx", trapno, pfn);
+
+ if (!pfn_valid(pfn)) {
+ printk(KERN_ERR
+ "MCE %#lx: memory outside kernel control\n",
+ pfn);
+ return -ENXIO;
+ }
+
+ p = pfn_to_page(pfn);
+ hpage = compound_head(p);
+ if (TestSetPageHWPoison(p)) { /* bit was already set: this pfn has been reported before */
+ printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
+ return 0;
+ }
+
+ /*
+ * Currently errors on hugetlbfs pages are measured in hugepage units,
+ * so nr_pages should be 1 << compound_order. OTOH when errors are on
+ * transparent hugepages, they are supposed to be split and error
+ * measurement is done in normal page units. So nr_pages should be one
+ * in this case.
+ */
+ if (PageHuge(p))
+ nr_pages = 1 << compound_order(hpage);
+ else /* normal page or thp */
+ nr_pages = 1;
+ atomic_long_add(nr_pages, &num_poisoned_pages);
+
+ /*
+ * We need/can do nothing about count=0 pages.
+ * 1) it's a free page, and therefore in safe hand:
+ * prep_new_page() will be the gate keeper.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
+ * Implies some kernel user: cannot stop them from
+ * R/W the page; let's pray that the page has been
+ * used and will be freed some time later.
+ * In fact it's dangerous to directly bump up page count from 0,
+ * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+ *
+ * The reference is only taken here when the caller has not already
+ * taken one (MF_COUNT_INCREASED not set).
+ */
+ if (!(flags & MF_COUNT_INCREASED) &&
+ !get_page_unless_zero(hpage)) {
+ if (is_free_buddy_page(p)) {
+ action_result(pfn, "free buddy", DELAYED);
+ return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "filter hit" and "race with other subpage."
+ */
+ lock_page(hpage);
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages); /* undo the accounting done above */
+ unlock_page(hpage);
+ return 0;
+ }
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
+ } else {
+ action_result(pfn, "high order kernel", IGNORED);
+ return -EBUSY;
+ }
+ }
+
+ /*
+ * We ignore non-LRU pages for good reasons.
+ * - PG_locked is only well defined for LRU pages and a few others
+ * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+ * The check (unnecessarily) ignores LRU pages being isolated and
+ * walked by the page reclaim code, however that's not a big loss.
+ */
+ if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, "free buddy", DELAYED);
+ else
+ action_result(pfn, "free buddy, 2nd try", DELAYED);
+ return 0;
+ }
+ }
+ }
+
+ lock_page(hpage);
+
+ /*
+ * We use page flags to determine what action should be taken, but
+ * the flags can be modified by the error containment action. One
+ * example is an mlocked page, where PG_mlocked is cleared by
+ * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+ * correctly, we save a copy of the page flags at this time.
+ */
+ page_flags = p->flags;
+
+ /*
+ * unpoison always clears PG_hwpoison inside page lock
+ */
+ if (!PageHWPoison(p)) {
+ printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
+ res = 0;
+ goto out;
+ }
+ if (hwpoison_filter(p)) {
+ if (TestClearPageHWPoison(p))
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+
+ if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) /* still not on the LRU: skip unmapping, go straight to classification */
+ goto identify_page_state;
+
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
+
+ /*
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
+ wait_on_page_writeback(p);
+
+ /*
+ * Now take care of user space mappings.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
+ */
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
+ action_result(pfn, "unmapping failed", IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * Torn down by someone else?
+ */
+ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+ action_result(pfn, "already truncated LRU", IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
+identify_page_state:
+ res = -EBUSY;
+ /*
+ * The first check uses the current page flags which may not have any
+ * relevant information. The second check with the saved page flags is
+ * carried out only if the first check can't determine the page status.
+ * Neither loop has an explicit bound: both stop at the first entry of
+ * error_states whose mask/res pair matches, and an entry with a zero
+ * mask is treated below as "not determined".
+ */
+ for (ps = error_states;; ps++)
+ if ((p->flags & ps->mask) == ps->res)
+ break;
+
+ page_flags |= (p->flags & (1UL << PG_dirty)); /* carry the current PG_dirty bit into the saved snapshot */
+
+ if (!ps->mask)
+ for (ps = error_states;; ps++)
+ if ((page_flags & ps->mask) == ps->res)
+ break;
+ res = page_action(ps, p, pfn);
+out:
+ unlock_page(hpage);
+ return res;
+}
+EXPORT_SYMBOL_GPL(memory_failure);
+
+#define MEMORY_FAILURE_FIFO_ORDER 4
+#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
+
+struct memory_failure_entry { /* one queued request; holds the three arguments of memory_failure_queue() */
+ unsigned long pfn; /* page frame number of the affected page */
+ int trapno; /* passed through to memory_failure() */
+ int flags; /* MF_* flags; MF_SOFT_OFFLINE selects soft_offline_page() */
+};
+
+struct memory_failure_cpu { /* per-CPU queue of pending memory failure requests */
+ DECLARE_KFIFO(fifo, struct memory_failure_entry,
+ MEMORY_FAILURE_FIFO_SIZE);
+ spinlock_t lock; /* taken with IRQs disabled around every fifo put/get */
+ struct work_struct work; /* runs memory_failure_work_func() to drain the fifo */
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @trapno: Trap number reported in the signal to user space.
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+	struct memory_failure_entry request = {
+		.pfn = pfn,
+		.trapno = trapno,
+		.flags = flags,
+	};
+	struct memory_failure_cpu *percpu;
+	unsigned long irq_state;
+	int queued;
+
+	/* Paired with put_cpu_var() at the end of the function. */
+	percpu = &get_cpu_var(memory_failure_cpu);
+
+	spin_lock_irqsave(&percpu->lock, irq_state);
+	queued = kfifo_put(&percpu->fifo, request);
+	if (!queued)
+		/* Fifo full: the report is dropped, only logged. */
+		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+		       pfn);
+	else
+		/* Have the worker on this same CPU drain the fifo. */
+		schedule_work_on(smp_processor_id(), &percpu->work);
+	spin_unlock_irqrestore(&percpu->lock, irq_state);
+
+	put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+/*
+ * Work handler: drain this CPU's fifo of queued memory failure requests,
+ * handing each one to soft_offline_page() when MF_SOFT_OFFLINE is set and
+ * to memory_failure() otherwise. Returns once the fifo is empty.
+ */
+static void memory_failure_work_func(struct work_struct *work)
+{
+	struct memory_failure_cpu *percpu = this_cpu_ptr(&memory_failure_cpu);
+	struct memory_failure_entry item = { 0, };
+	unsigned long irq_state;
+	int have_item;
+
+	while (1) {
+		/* The lock covers only the pop, never the handling. */
+		spin_lock_irqsave(&percpu->lock, irq_state);
+		have_item = kfifo_get(&percpu->fifo, &item);
+		spin_unlock_irqrestore(&percpu->lock, irq_state);
+		if (!have_item)
+			return;
+
+		if (!(item.flags & MF_SOFT_OFFLINE)) {
+			memory_failure(item.pfn, item.trapno, item.flags);
+			continue;
+		}
+		soft_offline_page(pfn_to_page(item.pfn), item.flags);
+	}
+}
+
+/*
+ * Initialise the lock, fifo and work item of every possible CPU's
+ * memory_failure_cpu instance. Always returns 0.
+ */
+static int __init memory_failure_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct memory_failure_cpu *slot =
+			&per_cpu(memory_failure_cpu, cpu);
+
+		spin_lock_init(&slot->lock);
+		INIT_KFIFO(slot->fifo);
+		INIT_WORK(&slot->work, memory_failure_work_func);
+	}
+
+	return 0;
+}
+core_initcall(memory_failure_init);
+
+/**
+ * unpoison_memory - Unpoison a previously poisoned page
+ * @pfn: Page number of the to be unpoisoned page
+ *
+ * Software-unpoison a page that has been poisoned by
+ * memory_failure() earlier.
+ *
+ * This is only done on the software-level, so it only works
+ * for linux injected failures, not real hardware failures
+ *
+ * Returns 0 for success, otherwise -errno.
+ */
+int unpoison_memory(unsigned long pfn)
+{
+ struct page *page; /* compound_head(p): the page that is locked and refcounted */
+ struct page *p; /* the raw page that pfn refers to */
+ int freeit = 0; /* set once PG_hwpoison was actually cleared under the page lock */
+ unsigned int nr_pages;
+
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+
+ p = pfn_to_page(pfn);
+ page = compound_head(p);
+
+ if (!PageHWPoison(p)) {
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+ return 0;
+ }
+
+ /*
+ * unpoison_memory() can encounter thp only when the thp is being
+ * worked by memory_failure() and the page lock is not held yet.
+ * In such case, we yield to memory_failure() and make unpoison fail.
+ * Note that the page stays poisoned but the return value is still 0.
+ */
+ if (!PageHuge(page) && PageTransHuge(page)) {
+ pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+ return 0;
+ }
+
+ nr_pages = 1 << compound_order(page);
+
+ if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
+ if (TestClearPageHWPoison(p))
+ atomic_long_dec(&num_poisoned_pages); /* a zero-refcount non-huge page is accounted as a single page */
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ return 0;
+ }
+
+ lock_page(page);
+ /*
+ * This test is racy because PG_hwpoison is set outside of page lock.
+ * That's acceptable because that won't trigger kernel panic. Instead,
+ * the PG_hwpoison page will be caught and isolated on the entrance to
+ * the free buddy page pool.
+ */
+ if (TestClearPageHWPoison(page)) {
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
+ }
+ unlock_page(page);
+
+ put_page(page); /* drops the reference taken by get_page_unless_zero() above */
+ /*
+ * NOTE(review): the second put presumably drops the reference that was
+ * kept on the page when it was poisoned; it is skipped for the zero
+ * page when only one reference remains. Confirm against the poisoning
+ * path.
+ */
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
+ put_page(page);
+
+ return 0;
+}
+EXPORT_SYMBOL(unpoison_memory);
+
+/*
+ * Allocation callback handed to migrate_pages(): returns a replacement
+ * page on the same node as @p. A hugetlb source gets a huge page from the
+ * hstate of its compound head; anything else gets an order-0
+ * GFP_HIGHUSER_MOVABLE page. @private and @x are not used.
+ */
+static struct page *new_page(struct page *p, unsigned long private, int **x)
+{
+	int node = page_to_nid(p);
+
+	if (!PageHuge(p))
+		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+
+	return alloc_huge_page_node(page_hstate(compound_head(p)), node);
+}
+
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, -EIO for a zero refcount page
+ * that is not free, and 1 for any other page type.
+ * For 1 the page is returned with increased page count, otherwise not.
+ */
+static int __get_any_page(struct page *p, unsigned long pfn, int flags)
+{
+	/* The caller already holds a reference; nothing to take. */
+	if (flags & MF_COUNT_INCREASED)
+		return 1;
+
+	/* Not a free page: a reference on the compound head is now held. */
+	if (get_page_unless_zero(compound_head(p)))
+		return 1;
+
+	/*
+	 * Zero refcount. When the target page is a free hugepage, just
+	 * remove it from free hugepage list (done by the caller); a free
+	 * buddy page is equally fine. Anything else cannot be handled.
+	 */
+	if (PageHuge(p)) {
+		pr_info("%s: %#lx free huge page\n", __func__, pfn);
+		return 0;
+	}
+	if (is_free_buddy_page(p)) {
+		pr_info("%s: %#lx free buddy page\n", __func__, pfn);
+		return 0;
+	}
+
+	pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
+		__func__, pfn, p->flags);
+	return -EIO;
+}
+
+/*
+ * Take a reference on @page for soft offlining, giving a non-LRU,
+ * non-hugetlb page one chance to become free or reach the LRU via
+ * shake_page().
+ *
+ * Returns 1 with a reference held for an in-use page that can be handled,
+ * 0 for a free page (no reference held), and -EIO for a page that cannot
+ * be handled (no reference held).
+ */
+static int get_any_page(struct page *page, unsigned long pfn, int flags)
+{
+	int ret = __get_any_page(page, pfn, flags);
+
+	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
+		/*
+		 * Try to free it.
+		 */
+		put_page(page);
+		shake_page(page, 1);
+
+		/*
+		 * Did it turn free?
+		 */
+		ret = __get_any_page(page, pfn, 0);
+		/*
+		 * Only an in-use page that is still off the LRU is a failure.
+		 * A page that turned free (ret == 0) or that has no known
+		 * type (ret == -EIO) is reported through ret as is; neither
+		 * holds a reference.
+		 */
+		if (ret == 1 && !PageLRU(page)) {
+			/* Drop the reference just taken by __get_any_page(). */
+			put_page(page);
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
+				pfn, page->flags);
+			return -EIO;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Soft offline an in-use hugetlb page by migrating it away. On success the
+ * source page is marked hwpoisoned and added to num_poisoned_pages.
+ *
+ * Returns 0 on success, -EBUSY if the hugepage is already poisoned, a
+ * negative value from migrate_pages(), or -EIO when migrate_pages()
+ * returned a positive value. @flags is not used here.
+ */
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ /*
+ * This double-check of PageHWPoison is to avoid the race with
+ * memory_failure(). See also comment in __soft_offline_page().
+ */
+ lock_page(hpage);
+ if (PageHWPoison(hpage)) {
+ unlock_page(hpage);
+ put_page(hpage);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+ unlock_page(hpage);
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+ list_move(&hpage->lru, &pagelist);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ /*
+ * We know that soft_offline_huge_page() tries to migrate
+ * only one hugepage pointed to by hpage, so we need not
+ * run through the pagelist here.
+ */
+ putback_active_hugepage(hpage);
+ if (ret > 0) /* positive return from migrate_pages() is reported as -EIO */
+ ret = -EIO;
+ } else {
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ }
+ return ret;
+}
+
+/*
+ * Soft offline an in-use page that is not a hugetlb page: first try to
+ * invalidate it, and if that does not work isolate it from the LRU and
+ * migrate it. On success the page is marked hwpoisoned and counted in
+ * num_poisoned_pages.
+ *
+ * Returns 0 on success, -EBUSY if the page is already poisoned, the
+ * non-zero result of isolate_lru_page(), a negative value from
+ * migrate_pages(), or -EIO when migrate_pages() returned a positive
+ * value. @flags is not used here.
+ */
+static int __soft_offline_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+
+ /*
+ * Check PageHWPoison again inside page lock because PageHWPoison
+ * is set by memory_failure() outside page lock. Note that
+ * memory_failure() also double-checks PageHWPoison inside page lock,
+ * so there's no race between soft_offline_page() and memory_failure().
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
+ if (PageHWPoison(page)) {
+ unlock_page(page);
+ put_page(page);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ return -EBUSY;
+ }
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
+ unlock_page(page);
+ /*
+ * RED-PEN would be better to keep it isolated here, but we
+ * would need to fix isolation locking first.
+ */
+ if (ret == 1) {
+ put_page(page);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ return 0;
+ }
+
+ /*
+ * Simple invalidation didn't work.
+ * Try to migrate to a new page instead. migrate.c
+ * handles a large number of cases for us.
+ */
+ ret = isolate_lru_page(page);
+ /*
+ * Drop the page reference which came from get_any_page();
+ * a successful isolate_lru_page() already took another one.
+ */
+ put_page(page);
+ if (!ret) {
+ LIST_HEAD(pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ list_add(&page->lru, &pagelist);
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ if (!list_empty(&pagelist)) { /* page is still on our private list: undo the isolation */
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0) /* positive return from migrate_pages() is reported as -EIO */
+ ret = -EIO;
+ } else {
+ /*
+ * After page migration succeeds, the source page can
+ * be trapped in pagevec and actual freeing is delayed.
+ * Freeing code works differently based on PG_hwpoison,
+ * so there's a race. We need to make sure that the
+ * source page should be freed back to buddy before
+ * setting PG_hwpoison.
+ */
+ if (!is_free_buddy_page(page))
+ lru_add_drain_all();
+ if (!is_free_buddy_page(page))
+ drain_all_pages();
+ SetPageHWPoison(page);
+ if (!is_free_buddy_page(page))
+ pr_info("soft offline: %#lx: page leaked\n",
+ pfn);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ } else {
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pfn, ret, page_count(page), page->flags);
+ }
+ return ret;
+}
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+
+ if (PageHWPoison(page)) { /* unlocked early check; the helpers re-check under the page lock */
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ return -EBUSY;
+ }
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
+
+ get_online_mems();
+
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
+ */
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(page, true);
+
+ ret = get_any_page(page, pfn, flags);
+ put_online_mems();
+ if (ret > 0) { /* for in-use pages */
+ if (PageHuge(page))
+ ret = soft_offline_huge_page(page, flags);
+ else
+ ret = __soft_offline_page(page, flags);
+ } else if (ret == 0) { /* for free pages */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
+ }
+ /*
+ * NOTE(review): the isolation is undone unconditionally, including when
+ * the pageblock was already MIGRATE_ISOLATE on entry and when the
+ * result of set_migratetype_isolate() was not checked -- confirm this
+ * cannot undo an isolation owned by someone else.
+ */
+ unset_migratetype_isolate(page, MIGRATE_MOVABLE);
+ return ret;
+}
diff --git a/mm/memory.c b/mm/memory.c
index 601159a46ab..8b44f765b64 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,20 +45,35 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
+#include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
+#include <linux/dma-debug.h>
+#include <linux/debugfs.h>
+#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-#include <linux/swapops.h>
-#include <linux/elf.h>
+#include "internal.h"
+
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
+#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
@@ -69,7 +84,6 @@ EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif
-unsigned long num_physpages;
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -78,13 +92,21 @@ unsigned long num_physpages;
* and ZONE_HIGHMEM.
*/
void * high_memory;
-unsigned long vmalloc_earlyreserve;
-EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
-int randomize_va_space __read_mostly = 1;
+/*
+ * Randomize the address space (stacks, mmaps, brk, etc.).
+ *
+ * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
+ * as ancient (libc5 based) binaries can segfault. )
+ */
+int randomize_va_space __read_mostly =
+#ifdef CONFIG_COMPAT_BRK
+ 1;
+#else
+ 2;
+#endif
static int __init disable_randmaps(char *s)
{
@@ -93,43 +115,285 @@ static int __init disable_randmaps(char *s)
}
__setup("norandmaps", disable_randmaps);
+unsigned long zero_pfn __read_mostly;
+unsigned long highest_memmap_pfn __read_mostly;
/*
- * If a p?d_bad entry is found while walking page tables, report
- * the error, before resetting entry to p?d_none. Usually (but
- * very seldom) called out from the p?d_none_or_clear_bad macros.
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
+static int __init init_zero_pfn(void)
+{
+ zero_pfn = page_to_pfn(ZERO_PAGE(0)); /* record the pfn of ZERO_PAGE(0) once, from an initcall */
+ return 0;
+}
+core_initcall(init_zero_pfn);
+
+
+#if defined(SPLIT_RSS_COUNTING)
-void pgd_clear_bad(pgd_t *pgd)
+void sync_mm_rss(struct mm_struct *mm)
{
- pgd_ERROR(*pgd);
- pgd_clear(pgd);
+ int i;
+
+ for (i = 0; i < NR_MM_COUNTERS; i++) {
+ if (current->rss_stat.count[i]) {
+ add_mm_counter(mm, i, current->rss_stat.count[i]);
+ current->rss_stat.count[i] = 0;
+ }
+ }
+ current->rss_stat.events = 0;
}
-void pud_clear_bad(pud_t *pud)
+static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
{
- pud_ERROR(*pud);
- pud_clear(pud);
+ struct task_struct *task = current;
+
+ if (likely(task->mm == mm))
+ task->rss_stat.count[member] += val;
+ else
+ add_mm_counter(mm, member, val);
}
+#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
+#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-void pmd_clear_bad(pmd_t *pmd)
+/* sync counter once per 64 page faults */
+#define TASK_RSS_EVENTS_THRESH (64)
+static void check_sync_rss_stat(struct task_struct *task)
{
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
+ if (unlikely(task != current))
+ return;
+ if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
+ sync_mm_rss(task->mm);
}
+#else /* SPLIT_RSS_COUNTING */
+
+#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
+#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
+
+static void check_sync_rss_stat(struct task_struct *task)
+{ /* without SPLIT_RSS_COUNTING the *_fast macros update the mm counters directly: nothing to sync */
+}
+
+#endif /* SPLIT_RSS_COUNTING */
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+
+/*
+ * Make the batch after tlb->active the active one, reusing an already
+ * linked batch when there is one and otherwise allocating a fresh page
+ * for it. Returns 1 on success, 0 when MAX_GATHER_BATCH_COUNT batches
+ * are already allocated or the allocation fails.
+ */
+static int tlb_next_batch(struct mmu_gather *tlb)
+{
+	struct mmu_gather_batch *cur = tlb->active;
+	struct mmu_gather_batch *fresh;
+
+	if (cur->next) {
+		tlb->active = cur->next;
+		return 1;
+	}
+
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
+	fresh = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+	if (!fresh)
+		return 0;
+
+	fresh->next = NULL;
+	fresh->nr = 0;
+	fresh->max = MAX_GATHER_BATCH;
+
+	/* Link the new batch behind the current one and switch to it. */
+	cur->next = fresh;
+	tlb->active = fresh;
+	tlb->batch_count++;
+
+	return 1;
+}
+
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The fullmm flag is derived from @start and @end: it
+ * is set only when @start is 0 and @end is ~0UL, which is used when @mm is
+ * without users and we're going to destroy the full address space
+ * (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ tlb->mm = mm;
+
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1)); /* end+1 wraps to 0 only for end == ~0UL */
+ tlb->need_flush_all = 0;
+ tlb->start = start;
+ tlb->end = end;
+ tlb->need_flush = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local; /* start gathering into the batch embedded in the structure */
+ tlb->batch_count = 0;
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
+#endif
+}
+
+static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+{
+ tlb->need_flush = 0; /* cleared before the flush is issued */
+ tlb_flush(tlb);
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb); /* also hand any pending table batch over for freeing */
+#endif
+}
+
+/*
+ * Free the pages gathered in every batch, empty each batch, and make the
+ * embedded local batch the active one again. The batch pages themselves
+ * stay linked for reuse.
+ */
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+	struct mmu_gather_batch *cur = &tlb->local;
+
+	while (cur) {
+		free_pages_and_swap_cache(cur->pages, cur->nr);
+		cur->nr = 0;
+		cur = cur->next;
+	}
+	tlb->active = &tlb->local;
+}
+
+/*
+ * Flush the TLB and then free the gathered pages, but only when
+ * need_flush is set; otherwise do nothing.
+ */
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+	if (tlb->need_flush) {
+		tlb_flush_mmu_tlbonly(tlb);
+		tlb_flush_mmu_free(tlb);
+	}
+}
+
+/* tlb_finish_mmu
+ *	Called at the end of the shootdown operation to free up any resources
+ *	that were required: performs a final flush and releases every batch
+ *	page that was allocated beyond the embedded local batch. @start and
+ *	@end are not used here.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+	struct mmu_gather_batch *cur, *following;
+
+	tlb_flush_mmu(tlb);
+
+	/* keep the page table cache within bounds */
+	check_pgt_cache();
+
+	cur = tlb->local.next;
+	while (cur) {
+		/* Read the link before the page holding it is freed. */
+		following = cur->next;
+		free_pages((unsigned long)cur, 0);
+		cur = following;
+	}
+	tlb->local.next = NULL;
+}
+
+/* __tlb_remove_page
+ *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
+ *	handling the additional races in SMP caused by other CPUs caching valid
+ *	mappings in their TLBs. Returns the number of free page slots left.
+ *	When out of page slots we must call tlb_flush_mmu().
+ */
+int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	struct mmu_gather_batch *cur;
+
+	VM_BUG_ON(!tlb->need_flush);
+
+	cur = tlb->active;
+	cur->pages[cur->nr] = page;
+	cur->nr++;
+	if (cur->nr == cur->max) {
+		/* Active batch is full: move on, or report no slots left. */
+		if (!tlb_next_batch(tlb))
+			return 0;
+		cur = tlb->active;
+	}
+	VM_BUG_ON_PAGE(cur->nr > cur->max, page);
+
+	return cur->max - cur->nr;
+}
+
+#endif /* HAVE_GENERIC_MMU_GATHER */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+static void tlb_remove_table_smp_sync(void *arg) /* callback passed to smp_call_function(); @arg is unused */
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table) /* free a single table without going through a batch */
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); /* last argument 1: wait for the other CPUs to run the callback */
+ __tlb_remove_table(table);
+}
+
+/*
+ * RCU callback queued by tlb_table_flush(): free every table recorded in
+ * the batch that embeds @head, then free the batch page itself.
+ */
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+	struct mmu_table_batch *done =
+		container_of(head, struct mmu_table_batch, rcu);
+	int idx = 0;
+
+	while (idx < done->nr) {
+		__tlb_remove_table(done->tables[idx]);
+		idx++;
+	}
+
+	free_page((unsigned long)done);
+}
+
+/*
+ * Hand the pending table batch, if any, to call_rcu_sched() for deferred
+ * freeing and detach it from @tlb.
+ */
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+	struct mmu_table_batch *pending = tlb->batch;
+
+	if (!pending)
+		return;
+
+	call_rcu_sched(&pending->rcu, tlb_remove_table_rcu);
+	tlb->batch = NULL;
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ tlb->need_flush = 1;
+
+ /*
+ * When there are fewer than two users of this mm there cannot be a
+ * concurrent page-table walk, so the table is freed immediately.
+ */
+ if (atomic_read(&tlb->mm->mm_users) < 2) {
+ __tlb_remove_table(table);
+ return;
+ }
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_remove_table_one(table); /* no page for a batch: free this one table through the IPI path */
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH) /* batch is full: queue it for freeing now */
+ tlb_table_flush(tlb);
+}
+
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
-static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr)
{
- struct page *page = pmd_page(*pmd);
+ pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
- pte_lock_deinit(page);
- pte_free_tlb(tlb, page);
- dec_zone_page_state(page, NR_PAGETABLE);
- tlb->mm->nr_ptes--;
+ pte_free_tlb(tlb, token, addr);
+ atomic_long_dec(&tlb->mm->nr_ptes);
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -146,7 +410,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- free_pte_range(tlb, pmd);
+ free_pte_range(tlb, pmd, addr);
} while (pmd++, addr = next, addr != end);
start &= PUD_MASK;
@@ -162,7 +426,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
- pmd_free_tlb(tlb, pmd);
+ pmd_free_tlb(tlb, pmd, start);
}
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -195,21 +459,18 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
pud = pud_offset(pgd, start);
pgd_clear(pgd);
- pud_free_tlb(tlb, pud);
+ pud_free_tlb(tlb, pud, start);
}
/*
* This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
*/
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
* The next few lines have given us lots of grief...
@@ -253,20 +514,16 @@ void free_pgd_range(struct mmu_gather **tlb,
if (addr > end - 1)
return;
- start = addr;
- pgd = pgd_offset((*tlb)->mm, addr);
+ pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+ free_pud_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
-
- if (!(*tlb)->fullmm)
- flush_tlb_pgtables((*tlb)->mm, start, end);
}
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
@@ -274,9 +531,10 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
unsigned long addr = vma->vm_start;
/*
- * Hide vma from rmap and vmtruncate before freeing pgtables
+ * Hide vma from rmap and truncate_pagecache before freeing
+ * pgtables
*/
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
@@ -290,7 +548,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
- anon_vma_unlink(vma);
+ unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
@@ -300,23 +558,43 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address)
{
- struct page *new = pte_alloc_one(mm, address);
+ spinlock_t *ptl;
+ pgtable_t new = pte_alloc_one(mm, address);
+ int wait_split_huge_page;
if (!new)
return -ENOMEM;
- pte_lock_init(new);
- spin_lock(&mm->page_table_lock);
- if (pmd_present(*pmd)) { /* Another has populated it */
- pte_lock_deinit(new);
- pte_free(new);
- } else {
- mm->nr_ptes++;
- inc_zone_page_state(new, NR_PAGETABLE);
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_read_barrier_depends() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
+ ptl = pmd_lock(mm, pmd);
+ wait_split_huge_page = 0;
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
- }
- spin_unlock(&mm->page_table_lock);
+ new = NULL;
+ } else if (unlikely(pmd_trans_splitting(*pmd)))
+ wait_split_huge_page = 1;
+ spin_unlock(ptl);
+ if (new)
+ pte_free(mm, new);
+ if (wait_split_huge_page)
+ wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -326,21 +604,34 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&init_mm.page_table_lock);
- if (pmd_present(*pmd)) /* Another has populated it */
- pte_free_kernel(new);
- else
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
+ new = NULL;
+ } else
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
+ if (new)
+ pte_free_kernel(&init_mm, new);
return 0;
}
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void init_rss_vec(int *rss)
+{
+ memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+}
+
+static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
- if (file_rss)
- add_mm_counter(mm, file_rss, file_rss);
- if (anon_rss)
- add_mm_counter(mm, anon_rss, anon_rss);
+ int i;
+
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (rss[i])
+ add_mm_counter(mm, i, rss[i]);
}
/*
@@ -350,71 +641,156 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
*
* The calling function must still handle the error.
*/
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
-{
- printk(KERN_ERR "Bad pte = %08llx, process = %s, "
- "vm_flags = %lx, vaddr = %lx\n",
- (long long)pte_val(pte),
- (vma->vm_mm == current->mm ? current->comm : "???"),
- vma->vm_flags, vaddr);
- dump_stack();
-}
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, struct page *page)
+{
+ pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+ struct address_space *mapping;
+ pgoff_t index;
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
-static inline int is_cow_mapping(unsigned int flags)
-{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ return;
+ }
+ if (nr_unshown) {
+ printk(KERN_ALERT
+ "BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+ index = linear_page_index(vma, addr);
+
+ printk(KERN_ALERT
+ "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ if (page)
+ dump_page(page, "bad pte");
+ printk(KERN_ALERT
+ "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ /*
+ * Print text (function) symbols: data symbols need CONFIG_KALLSYMS_ALL=y
+ */
+ if (vma->vm_ops)
+ printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
+ vma->vm_ops->fault);
+ if (vma->vm_file)
+ printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
+ vma->vm_file->f_op->mmap);
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ *
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
*
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
*
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
*
* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
*
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
*/
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
- unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
- if (pfn == vma->vm_pgoff + off)
- return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (HAVE_PTE_SPECIAL) {
+ if (likely(!pte_special(pte) || pte_numa(pte)))
+ goto check_pfn;
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
+ if (!is_zero_pfn(pfn))
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
}
- /*
- * Add some anal sanity checks for now. Eventually,
- * we should just do "return pfn_to_page(pfn)", but
- * in the meantime we check that we get a valid pfn,
- * and that the resulting page looks ok.
- */
- if (unlikely(!pfn_valid(pfn))) {
- print_bad_pte(vma, pte, addr);
+ /* !HAVE_PTE_SPECIAL case follows: */
+
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+check_pfn:
+ if (unlikely(pfn > highest_memmap_pfn)) {
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
+ if (is_zero_pfn(pfn))
+ return NULL;
+
/*
- * NOTE! We still have PageReserved() pages in the page
- * tables.
- *
- * The PAGE_ZERO() pages and various VDSO mappings can
- * cause them to exist.
+ * NOTE! We still have PageReserved() pages in the page tables.
+ * eg. VDSO mappings can cause them to exist.
*/
+out:
return pfn_to_page(pfn);
}
@@ -424,7 +800,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
* covered by this vma.
*/
-static inline void
+static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
@@ -438,7 +814,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!pte_file(pte)) {
swp_entry_t entry = pte_to_swp_entry(pte);
- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
@@ -447,15 +825,28 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
- if (is_write_migration_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- /*
- * COW mappings require pages in both parent
- * and child to be set to read.
- */
- make_migration_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- set_pte_at(src_mm, addr, src_pte, pte);
+ if (likely(!non_swap_entry(entry)))
+ rss[MM_SWAPENTS]++;
+ else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
+
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
}
goto out_set_pte;
@@ -467,7 +858,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_cow_mapping(vm_flags)) {
ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = *src_pte;
+ pte = pte_wrprotect(pte);
}
/*
@@ -482,30 +873,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (page) {
get_page(page);
page_dup_rmap(page);
- rss[!!PageAnon(page)]++;
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
}
out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
+ pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[NR_MM_COUNTERS];
+ swp_entry_t entry = (swp_entry_t){0};
again:
- rss[1] = rss[0] = 0;
+ init_rss_vec(rss);
+
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
- src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ orig_src_pte = src_pte;
+ orig_dst_pte = dst_pte;
+ arch_enter_lazy_mmu_mode();
do {
/*
@@ -515,23 +916,32 @@ again:
if (progress >= 32) {
progress = 0;
if (need_resched() ||
- need_lockbreak(src_ptl) ||
- need_lockbreak(dst_ptl))
+ spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
break;
}
if (pte_none(*src_pte)) {
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+ vma, addr, rss);
+ if (entry.val)
+ break;
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(src_pte - 1);
- add_mm_rss(dst_mm, rss[0], rss[1]);
- pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ pte_unmap(orig_src_pte);
+ add_mm_rss_vec(dst_mm, rss);
+ pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
+
+ if (entry.val) {
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ progress = 0;
+ }
if (addr != end)
goto again;
return 0;
@@ -550,6 +960,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*src_pmd)) {
+ int err;
+ VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ err = copy_huge_pmd(dst_mm, src_mm,
+ dst_pmd, src_pmd, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -588,6 +1009,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ bool is_cow;
+ int ret;
/*
* Don't copy ptes where a page fault will fill them correctly.
@@ -595,7 +1020,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+ if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+ VM_PFNMAP | VM_MIXEDMAP))) {
if (!vma->anon_vma)
return 0;
}
@@ -603,40 +1029,71 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+ /*
+ * We do not free on error cases below as remove_vma
+ * gets called on error from higher level routine
+ */
+ ret = track_pfn_copy(vma);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * We need to invalidate the secondary MMU mappings only when
+ * there could be a permission downgrade on the ptes of the
+ * parent mm. And a permission downgrade will only happen if
+ * is_cow_mapping() returns true.
+ */
+ is_cow = is_cow_mapping(vma->vm_flags);
+ mmun_start = addr;
+ mmun_end = end;
+ if (is_cow)
+ mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+ mmun_end);
+
+ ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, addr, next))
- return -ENOMEM;
+ if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+ vma, addr, next))) {
+ ret = -ENOMEM;
+ break;
+ }
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- return 0;
+
+ if (is_cow)
+ mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
+ return ret;
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- pte_t *pte;
+ int force_flush = 0;
+ int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
- int file_rss = 0;
- int anon_rss = 0;
+ pte_t *start_pte;
+ pte_t *pte;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+again:
+ init_rss_vec(rss);
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = start_pte;
+ arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
if (pte_none(ptent)) {
- (*zap_work)--;
continue;
}
- (*zap_work) -= PAGE_SIZE;
-
if (pte_present(ptent)) {
struct page *page;
@@ -666,20 +1123,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
- addr) != page->index)
- set_pte_at(mm, addr, pte,
- pgoff_to_pte(page->index));
+ addr) != page->index) {
+ pte_t ptfile = pgoff_to_pte(page->index);
+ if (pte_soft_dirty(ptent))
+ pte_file_mksoft_dirty(ptfile);
+ set_pte_at(mm, addr, pte, ptfile);
+ }
if (PageAnon(page))
- anon_rss--;
+ rss[MM_ANONPAGES]--;
else {
- if (pte_dirty(ptent))
+ if (pte_dirty(ptent)) {
+ force_flush = 1;
set_page_dirty(page);
- if (pte_young(ptent))
+ }
+ if (pte_young(ptent) &&
+ likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
- file_rss--;
+ rss[MM_FILEPAGES]--;
}
page_remove_rmap(page);
- tlb_remove_page(tlb, page);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ if (unlikely(!__tlb_remove_page(tlb, page))) {
+ force_flush = 1;
+ break;
+ }
continue;
}
/*
@@ -688,13 +1156,63 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
*/
if (unlikely(details))
continue;
- if (!pte_file(ptent))
- free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ if (pte_file(ptent)) {
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
- add_mm_rss(mm, file_rss, anon_rss);
- pte_unmap_unlock(pte - 1, ptl);
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ else if (is_migration_entry(entry)) {
+ struct page *page;
+
+ page = migration_entry_to_page(entry);
+
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]--;
+ else
+ rss[MM_FILEPAGES]--;
+ }
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ add_mm_rss_vec(mm, rss);
+ arch_leave_lazy_mmu_mode();
+
+ /* Do the actual TLB flush before dropping ptl */
+ if (force_flush) {
+ unsigned long old_end;
+
+ /*
+ * Flush the TLB just for the previous segment,
+ * then update the range to be the remaining
+ * TLB range.
+ */
+ old_end = tlb->end;
+ tlb->end = addr;
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb->start = addr;
+ tlb->end = old_end;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+
+ /*
+ * If we forced a TLB flush (either due to running out of
+ * batch buffers or because we needed to flush dirty TLB
+ * entries before releasing the ptl), free the batched
+ * memory too. Restart if we didn't do everything.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu_free(tlb);
+
+ if (addr != end)
+ goto again;
+ }
return addr;
}
@@ -702,7 +1220,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -710,13 +1228,35 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd)) {
- (*zap_work)--;
- continue;
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE) {
+#ifdef CONFIG_DEBUG_VM
+ if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+ pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+ __func__, addr, end,
+ vma->vm_start,
+ vma->vm_end);
+ BUG();
+ }
+#endif
+ split_huge_page_pmd(vma, addr, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
}
- next = zap_pte_range(tlb, vma, pmd, addr, next,
- zap_work, details);
- } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+ /*
+ * Here there can be other concurrent MADV_DONTNEED or
+ * trans huge page faults running, and if the pmd is
+ * none or trans huge it can change under us. This is
+ * because MADV_DONTNEED holds the mmap_sem in read
+ * mode.
+ */
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto next;
+ next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
+ cond_resched();
+ } while (pmd++, addr = next, addr != end);
return addr;
}
@@ -724,7 +1264,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+ struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -732,21 +1272,18 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud)) {
- (*zap_work)--;
+ if (pud_none_or_clear_bad(pud))
continue;
- }
- next = zap_pmd_range(tlb, vma, pud, addr, next,
- zap_work, details);
- } while (pud++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+ } while (pud++, addr = next, addr != end);
return addr;
}
-static unsigned long unmap_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- long *zap_work, struct zap_details *details)
+static void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -755,46 +1292,72 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
details = NULL;
BUG_ON(addr >= end);
+ mem_cgroup_uncharge_start();
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd)) {
- (*zap_work)--;
+ if (pgd_none_or_clear_bad(pgd))
continue;
- }
- next = zap_pud_range(tlb, vma, pgd, addr, next,
- zap_work, details);
- } while (pgd++, addr = next, (addr != end && *zap_work > 0));
+ next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+ } while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
-
- return addr;
+ mem_cgroup_uncharge_end();
}
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
-#endif
+
+static void unmap_single_vma(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start_addr,
+ unsigned long end_addr,
+ struct zap_details *details)
+{
+ unsigned long start = max(vma->vm_start, start_addr);
+ unsigned long end;
+
+ if (start >= vma->vm_end)
+ return;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return;
+
+ if (vma->vm_file)
+ uprobe_munmap(vma, start, end);
+
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn(vma, 0, 0);
+
+ if (start != end) {
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ /*
+ * It is undesirable to test vma->vm_file as it
+ * should be non-null for a valid hugetlb area.
+ * However, vm_file will be NULL in the error
+ * cleanup path of mmap_region(). When the
+ * hugetlbfs ->mmap method fails,
+ * mmap_region() nullifies vma->vm_file
+ * before calling this function to clean up.
+ * Since no pte has actually been set up, it is
+ * safe to do nothing in this case.
+ */
+ if (vma->vm_file) {
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ }
+ } else
+ unmap_page_range(tlb, vma, start, end, details);
+ }
+}
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
- * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
- * @details: details of nonlinear truncation or shared cache invalidation
- *
- * Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list.
*
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
- *
* Only addresses between `start_addr' and `end_addr' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
@@ -804,387 +1367,103 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-unsigned long unmap_vmas(struct mmu_gather **tlbp,
+void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long *nr_accounted,
- struct zap_details *details)
+ unsigned long end_addr)
{
- long zap_work = ZAP_BLOCK_SIZE;
- unsigned long tlb_start = 0; /* For tlb_finish_mmu */
- int tlb_start_valid = 0;
- unsigned long start = start_addr;
- spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = (*tlbp)->fullmm;
-
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
- unsigned long end;
-
- start = max(vma->vm_start, start_addr);
- if (start >= vma->vm_end)
- continue;
- end = min(vma->vm_end, end_addr);
- if (end <= vma->vm_start)
- continue;
-
- if (vma->vm_flags & VM_ACCOUNT)
- *nr_accounted += (end - start) >> PAGE_SHIFT;
-
- while (start != end) {
- if (!tlb_start_valid) {
- tlb_start = start;
- tlb_start_valid = 1;
- }
-
- if (unlikely(is_vm_hugetlb_page(vma))) {
- unmap_hugepage_range(vma, start, end);
- zap_work -= (end - start) /
- (HPAGE_SIZE / PAGE_SIZE);
- start = end;
- } else
- start = unmap_page_range(*tlbp, vma,
- start, end, &zap_work, details);
-
- if (zap_work > 0) {
- BUG_ON(start != end);
- break;
- }
-
- tlb_finish_mmu(*tlbp, tlb_start, start);
-
- if (need_resched() ||
- (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
- if (i_mmap_lock) {
- *tlbp = NULL;
- goto out;
- }
- cond_resched();
- }
+ struct mm_struct *mm = vma->vm_mm;
- *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
- tlb_start_valid = 0;
- zap_work = ZAP_BLOCK_SIZE;
- }
- }
-out:
- return start; /* which is now the end (or restart) address */
+ mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+ mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
}
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * Caller must protect the VMA list
*/
-unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
- struct mmu_gather *tlb;
- unsigned long end = address + size;
- unsigned long nr_accounted = 0;
+ struct mmu_gather tlb;
+ unsigned long end = start + size;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
- end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
- if (tlb)
- tlb_finish_mmu(tlb, address, end);
- return end;
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, end, details);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
}
-/*
- * Do a quick page-table lookup for a single page.
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of nonlinear truncation or shared cache invalidation
+ *
+ * The range must fit into one VMA.
*/
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep, pte;
- spinlock_t *ptl;
- struct page *page;
struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+ unsigned long end = address + size;
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- goto out;
- }
-
- page = NULL;
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto no_page_table;
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- goto no_page_table;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto no_page_table;
-
- if (pmd_huge(*pmd)) {
- BUG_ON(flags & FOLL_GET);
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- goto out;
- }
-
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!ptep)
- goto out;
-
- pte = *ptep;
- if (!pte_present(pte))
- goto unlock;
- if ((flags & FOLL_WRITE) && !pte_write(pte))
- goto unlock;
- page = vm_normal_page(vma, address, pte);
- if (unlikely(!page))
- goto unlock;
-
- if (flags & FOLL_GET)
- get_page(page);
- if (flags & FOLL_TOUCH) {
- if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
- mark_page_accessed(page);
- }
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
- return page;
-
-no_page_table:
- /*
- * When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate page tables.
- */
- if (flags & FOLL_ANON) {
- page = ZERO_PAGE(address);
- if (flags & FOLL_GET)
- get_page(page);
- BUG_ON(flags & FOLL_WRITE);
- }
- return page;
-}
-
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, int write, int force,
- struct page **pages, struct vm_area_struct **vmas)
-{
- int i;
- unsigned int vm_flags;
-
- /*
- * Require read or write permissions.
- * If 'force' is set, we only require the "MAY" flags.
- */
- vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
- i = 0;
-
- do {
- struct vm_area_struct *vma;
- unsigned int foll_flags;
-
- vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(tsk, start)) {
- unsigned long pg = start & PAGE_MASK;
- struct vm_area_struct *gate_vma = get_gate_vma(tsk);
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- if (write) /* user gate pages are read-only */
- return i ? : -EFAULT;
- if (pg > TASK_SIZE)
- pgd = pgd_offset_k(pg);
- else
- pgd = pgd_offset_gate(mm, pg);
- BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, pg);
- BUG_ON(pud_none(*pud));
- pmd = pmd_offset(pud, pg);
- if (pmd_none(*pmd))
- return i ? : -EFAULT;
- pte = pte_offset_map(pmd, pg);
- if (pte_none(*pte)) {
- pte_unmap(pte);
- return i ? : -EFAULT;
- }
- if (pages) {
- struct page *page = vm_normal_page(gate_vma, start, *pte);
- pages[i] = page;
- if (page)
- get_page(page);
- }
- pte_unmap(pte);
- if (vmas)
- vmas[i] = gate_vma;
- i++;
- start += PAGE_SIZE;
- len--;
- continue;
- }
-
- if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
- || !(vm_flags & vma->vm_flags))
- return i ? : -EFAULT;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
-
- foll_flags = FOLL_TOUCH;
- if (pages)
- foll_flags |= FOLL_GET;
- if (!write && !(vma->vm_flags & VM_LOCKED) &&
- (!vma->vm_ops || !vma->vm_ops->nopage))
- foll_flags |= FOLL_ANON;
-
- do {
- struct page *page;
-
- if (write)
- foll_flags |= FOLL_WRITE;
-
- cond_resched();
- while (!(page = follow_page(vma, start, foll_flags))) {
- int ret;
- ret = __handle_mm_fault(mm, vma, start,
- foll_flags & FOLL_WRITE);
- /*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has
- * broken COW when necessary, even if maybe_mkwrite
- * decided not to set pte_write. We can thus safely do
- * subsequent page lookups as if they were reads.
- */
- if (ret & VM_FAULT_WRITE)
- foll_flags &= ~FOLL_WRITE;
-
- switch (ret & ~VM_FAULT_WRITE) {
- case VM_FAULT_MINOR:
- tsk->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- tsk->maj_flt++;
- break;
- case VM_FAULT_SIGBUS:
- return i ? i : -EFAULT;
- case VM_FAULT_OOM:
- return i ? i : -ENOMEM;
- default:
- BUG();
- }
- }
- if (pages) {
- pages[i] = page;
-
- flush_anon_page(page, start);
- flush_dcache_page(page);
- }
- if (vmas)
- vmas[i] = vma;
- i++;
- start += PAGE_SIZE;
- len--;
- } while (len && start < vma->vm_end);
- } while (len);
- return i;
-}
-EXPORT_SYMBOL(get_user_pages);
-
-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t prot)
-{
- pte_t *pte;
- spinlock_t *ptl;
-
- pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
- if (!pte)
- return -ENOMEM;
- do {
- struct page *page = ZERO_PAGE(addr);
- pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
- page_cache_get(page);
- page_add_file_rmap(page);
- inc_mm_counter(mm, file_rss);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, addr, pte, zero_pte);
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(pte - 1, ptl);
- return 0;
-}
-
-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end, pgprot_t prot)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc(mm, pud, addr);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
- if (zeromap_pte_range(mm, pmd, addr, next, prot))
- return -ENOMEM;
- } while (pmd++, addr = next, addr != end);
- return 0;
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, address, end);
+ update_hiwater_rss(mm);
+ mmu_notifier_invalidate_range_start(mm, address, end);
+ unmap_single_vma(&tlb, vma, address, end, details);
+ mmu_notifier_invalidate_range_end(mm, address, end);
+ tlb_finish_mmu(&tlb, address, end);
}
-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end, pgprot_t prot)
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ *
+ * Returns 0 if successful, -1 if either condition above is not met.
+ */
+int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size)
{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc(mm, pgd, addr);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
- if (zeromap_pmd_range(mm, pud, addr, next, prot))
- return -ENOMEM;
- } while (pud++, addr = next, addr != end);
+ if (address < vma->vm_start || address + size > vma->vm_end ||
+ !(vma->vm_flags & VM_PFNMAP))
+ return -1;
+ zap_page_range_single(vma, address, size, NULL);
return 0;
}
+EXPORT_SYMBOL_GPL(zap_vma_ptes);
-int zeromap_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long size, pgprot_t prot)
-{
- pgd_t *pgd;
- unsigned long next;
- unsigned long end = addr + size;
- struct mm_struct *mm = vma->vm_mm;
- int err;
-
- BUG_ON(addr >= end);
- pgd = pgd_offset(mm, addr);
- flush_cache_range(vma, addr, end);
- do {
- next = pgd_addr_end(addr, end);
- err = zeromap_pud_range(mm, pgd, addr, next, prot);
- if (err)
- break;
- } while (pgd++, addr = next, addr != end);
- return err;
-}
-
-pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
{
pgd_t * pgd = pgd_offset(mm, addr);
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
+ if (pmd) {
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
}
return NULL;
}
@@ -1196,11 +1475,13 @@ pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlo
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page, pgprot_t prot)
{
+ struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *ptl;
retval = -EINVAL;
if (PageAnon(page))
@@ -1216,11 +1497,13 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter(mm, file_rss);
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
+ pte_unmap_unlock(pte, ptl);
+ return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
out:
@@ -1248,18 +1531,126 @@ out:
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
*/
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
- vma->vm_flags |= VM_INSERTPAGE;
- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t prot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int retval;
+ pte_t *pte, entry;
+ spinlock_t *ptl;
+
+ retval = -ENOMEM;
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+ goto out;
+ retval = -EBUSY;
+ if (!pte_none(*pte))
+ goto out_unlock;
+
+ /* Ok, finally just insert the thing.. */
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, entry);
+ update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
+
+ retval = 0;
+out_unlock:
+ pte_unmap_unlock(pte, ptl);
+out:
+ return retval;
+}
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ int ret;
+ pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * Technically, architectures with pte_special can avoid all these
+ * restrictions (same for remap_pfn_range). However we would like
+ * consistency in testing and feature parity among all, so we should
+ * try to keep these invariants in place for everybody.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return -EINVAL;
+
+ ret = insert_pfn(vma, addr, pfn, pgprot);
+
+ return ret;
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+
+ /*
+ * If we don't have pte special, then we have to use the pfn_valid()
+ * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+ * refcount the page if pfn_valid is true (hence insert_page rather
+ * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
+ * without pte special, it would then be refcounted as a normal page.
+ */
+ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+ struct page *page;
+
+ page = pfn_to_page(pfn);
+ return insert_page(vma, addr, page, vma->vm_page_prot);
+ }
+ return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -1275,11 +1666,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
+ arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
- set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1295,6 +1688,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
@@ -1348,18 +1742,18 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED is specified all over the place, because
- * in 2.4 it kept swapout's vma scan off this vma; but
- * in 2.6 the LRU scan won't even find its pages, so this
- * flag means no more than count its pages in reserved_vm,
- * and omit it from core dump, even when VM_IO turned off.
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
*/
if (is_cow_mapping(vma->vm_flags)) {
if (addr != vma->vm_start || end != vma->vm_end)
@@ -1367,7 +1761,11 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
vma->vm_pgoff = pfn;
}
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+ if (err)
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -1380,18 +1778,170 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
+
+ if (err)
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
+
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
+/**
+ * vm_iomap_memory - remap memory to userspace
+ * @vma: user vma to map to
+ * @start: start of area
+ * @len: size of area
+ *
+ * This is a simplified io_remap_pfn_range() for common driver use. The
+ * driver just needs to give us the physical memory range to be mapped,
+ * we'll figure out the rest from the vma information.
+ *
+ * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
+ * whatever write-combining details or similar.
+ */
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
+{
+ unsigned long vm_len, pfn, pages;
+
+ /* Check that the physical memory area passed in looks valid */
+ if (start + len < start)
+ return -EINVAL;
+ /*
+ * You *really* shouldn't map things that aren't page-aligned,
+ * but we've historically allowed it because IO memory might
+ * just have smaller alignment.
+ */
+ len += start & ~PAGE_MASK;
+ pfn = start >> PAGE_SHIFT;
+ pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ /* We start the mapping 'vm_pgoff' pages into the area */
+ if (vma->vm_pgoff > pages)
+ return -EINVAL;
+ pfn += vma->vm_pgoff;
+ pages -= vma->vm_pgoff;
+
+ /* Can we fit all of the mapping? */
+ vm_len = vma->vm_end - vma->vm_start;
+ if (vm_len >> PAGE_SHIFT > pages)
+ return -EINVAL;
+
+ /* Ok, let it rip */
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_iomap_memory);
+
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pte_t *pte;
+ int err;
+ pgtable_t token;
+ spinlock_t *uninitialized_var(ptl);
+
+ pte = (mm == &init_mm) ?
+ pte_alloc_kernel(pmd, addr) :
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+
+ BUG_ON(pmd_huge(*pmd));
+
+ arch_enter_lazy_mmu_mode();
+
+ token = pmd_pgtable(*pmd);
+
+ do {
+ err = fn(pte++, token, addr, data);
+ if (err)
+ break;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
+
+ if (mm != &init_mm)
+ pte_unmap_unlock(pte-1, ptl);
+ return err;
+}
+
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err;
+
+ BUG_ON(pud_huge(*pud));
+
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+ return err;
+}
+
+static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err;
+
+ pud = pud_alloc(mm, pgd, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pud++, addr = next, addr != end);
+ return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + size;
+ int err;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
/*
* handle_pte_fault chooses page fault handler according to an entry
* which was read non-atomically. Before making any commitment, on
* those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
* must check under lock before unmapping the pte and proceeding
* (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
+ * and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
@@ -1409,21 +1959,10 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
- * servicing faults for write access. In the normal case, do always want
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
- if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
-}
+ debug_dma_assert_idle(src);
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
-{
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
@@ -1431,7 +1970,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
* fails, we just zero-fill it. Live with it.
*/
if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst, KM_USER0);
+ void *kaddr = kmap_atomic(dst);
void __user *uaddr = (void __user *)(va & PAGE_MASK);
/*
@@ -1441,12 +1980,43 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
- kunmap_atomic(kaddr, KM_USER0);
- return;
-
- }
- copy_user_highpage(dst, src, va);
+ clear_page(kaddr);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+ } else
+ copy_user_highpage(dst, src, va, vma);
+}
+
+/*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ struct vm_fault vmf;
+ int ret;
+
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = page->index;
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ vmf.page = page;
+
+ ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
+ if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ return 0; /* retry */
+ }
+ ret |= VM_FAULT_LOCKED;
+ } else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ return ret;
}
/*
@@ -1470,25 +2040,59 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
{
- struct page *old_page, *new_page;
+ struct page *old_page, *new_page = NULL;
pte_t entry;
- int reuse = 0, ret = VM_FAULT_MINOR;
+ int ret = 0;
+ int page_mkwrite = 0;
struct page *dirty_page = NULL;
+ unsigned long mmun_start = 0; /* For mmu_notifiers */
+ unsigned long mmun_end = 0; /* For mmu_notifiers */
old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page)
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable as we can't do any dirty
+ * accounting on raw pfn maps.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ goto reuse;
goto gotten;
+ }
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page)) {
- if (!TestSetPageLocked(old_page)) {
- reuse = can_share_swap_page(old_page);
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ goto unlock;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
+ goto reuse;
}
+ unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
/*
@@ -1497,22 +2101,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- /*
- * Notify the address space that the page is about to
- * become writable so that it can prohibit this or wait
- * for the page to get into an appropriate state.
- *
- * We do this without the lock held, so that it can
- * sleep if it needs to.
- */
+ int tmp;
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
-
- if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
- goto unwritable_page;
-
- page_cache_release(old_page);
-
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
@@ -1521,23 +2118,68 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- if (!pte_same(*page_table, orig_pte))
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
goto unlock;
+ }
+
+ page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
- reuse = 1;
- }
- if (reuse) {
+reuse:
+ /*
+ * Clear the page's cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ if (old_page)
+ page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- ptep_set_access_flags(vma, address, page_table, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ if (ptep_set_access_flags(vma, address, page_table, entry,1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- goto unlock;
+
+ if (!dirty_page)
+ return ret;
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+ * bit after it clears all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * do_shared_fault is protected similarly.
+ */
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page);
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+ }
+ put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ return ret;
}
/*
@@ -1549,16 +2191,25 @@ gotten:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (old_page == ZERO_PAGE(address)) {
- new_page = alloc_zeroed_user_highpage(vma, address);
+
+ if (is_zero_pfn(pte_pfn(orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, address);
+ cow_user_page(new_page, old_page, address, vma);
}
+ __SetPageUptodate(new_page);
+
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto oom_free_new;
+
+ mmun_start = address & PAGE_MASK;
+ mmun_end = mmun_start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* Re-check the pte - we dropped the lock
@@ -1566,149 +2217,107 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page);
if (!PageAnon(old_page)) {
- dec_mm_counter(mm, file_rss);
- inc_mm_counter(mm, anon_rss);
+ dec_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- lazy_mmu_prot_update(entry);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
- lru_cache_add_active(new_page);
+ /*
+ * Clear the pte entry and flush it first, before updating the
+ * pte with the new entry. This will avoid a race condition
+ * seen in the presence of one thread doing SMC and another
+ * thread doing COW.
+ */
+ ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
+ /*
+ * We call the notify macro here because, when using secondary
+ * mmu page tables (such as kvm shadow page tables), we want the
+ * new page to be mapped directly into the secondary page table.
+ */
+ set_pte_at_notify(mm, address, page_table, entry);
+ update_mmu_cache(vma, address, page_table);
+ if (old_page) {
+ /*
+ * Only after switching the pte to the new page may
+ * we remove the mapcount here. Otherwise another
+ * process may come and find the rmap count decremented
+ * before the pte is switched to the new page, and
+ * "reuse" the old page writing into it while our pte
+ * here still points into it and can be read by other
+ * threads.
+ *
+ * The critical issue is to order this
+ * page_remove_rmap with the ptep_clear_flush above.
+ * Those stores are ordered by (if nothing else,)
+ * the barrier present in the atomic_add_negative
+ * in page_remove_rmap.
+ *
+ * Then the TLB flush in ptep_clear_flush ensures that
+ * no process can access the old page before the
+ * decremented mapcount is visible. And the old page
+ * cannot be reused until after the decremented
+ * mapcount is visible. So transitively, TLBs to
+ * old page will be flushed before it can be reused.
+ */
+ page_remove_rmap(old_page);
+ }
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
- }
+ } else
+ mem_cgroup_uncharge_page(new_page);
+
if (new_page)
page_cache_release(new_page);
- if (old_page)
- page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
- set_page_dirty_balance(dirty_page);
- put_page(dirty_page);
+ if (mmun_end > mmun_start)
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ if (old_page) {
+ /*
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
+ */
+ if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ lock_page(old_page); /* LRU manipulation */
+ munlock_vma_page(old_page);
+ unlock_page(old_page);
+ }
+ page_cache_release(old_page);
}
return ret;
+oom_free_new:
+ page_cache_release(new_page);
oom:
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;
-
-unwritable_page:
- page_cache_release(old_page);
- return VM_FAULT_SIGBUS;
}
-/*
- * Helper functions for unmap_mapping_range().
- *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
- *
- * We have to restart searching the prio_tree whenever we drop the lock,
- * since the iterator is only valid while the lock is held, and anyway
- * a later vma might be split and reinserted earlier while lock dropped.
- *
- * The list of nonlinear vmas could be handled more efficiently, using
- * a placeholder, but handle it in the same way until a need is shown.
- * It is important to search the prio_tree before nonlinear list: a vma
- * may become nonlinear and be shifted from prio_tree to nonlinear list
- * while the lock is dropped; but never shifted from list to prio_tree.
- *
- * In order to make forward progress despite restarting the search,
- * vm_truncate_count is used to mark a vma as now dealt with, so we can
- * quickly skip it next time around. Since the prio_tree search only
- * shows us those vmas affected by unmapping the range in question, we
- * can't efficiently keep all vmas in step with mapping->truncate_count:
- * so instead reset them all whenever it wraps back to 0 (then go to 1).
- * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
- *
- * In order to make forward progress despite repeatedly restarting some
- * large vma, note the restart_addr from unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again. It might
- * have been split or merged, shrunk or extended, but never shifted: so
- * restart_addr remains valid so long as it remains in the vma's range.
- * unmap_mapping_range forces truncate_count to leap over page-aligned
- * values so we can save vma's restart_addr in its truncate_count field.
- */
-#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
-
-static void reset_vma_truncate_counts(struct address_space *mapping)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- vma->vm_truncate_count = 0;
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- vma->vm_truncate_count = 0;
-}
-
-static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
- unsigned long restart_addr;
- int need_break;
-
-again:
- restart_addr = vma->vm_truncate_count;
- if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
- start_addr = restart_addr;
- if (start_addr >= end_addr) {
- /* Top of vma has been split off since last time */
- vma->vm_truncate_count = details->truncate_count;
- return 0;
- }
- }
-
- restart_addr = zap_page_range(vma, start_addr,
- end_addr - start_addr, details);
- need_break = need_resched() ||
- need_lockbreak(details->i_mmap_lock);
-
- if (restart_addr >= end_addr) {
- /* We have now completed this vma: mark it so */
- vma->vm_truncate_count = details->truncate_count;
- if (!need_break)
- return 0;
- } else {
- /* Note restart_addr in vma's truncate_count field */
- vma->vm_truncate_count = restart_addr;
- if (!need_break)
- goto again;
- }
-
- spin_unlock(details->i_mmap_lock);
- cond_resched();
- spin_lock(details->i_mmap_lock);
- return -EINTR;
+ zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
-restart:
- vma_prio_tree_foreach(vma, &iter, root,
+ vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
vba = vma->vm_pgoff;
- vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+ vea = vba + vma_pages(vma) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
@@ -1717,11 +2326,10 @@ restart:
if (zea > vea)
zea = vea;
- if (unmap_mapping_range_vma(vma,
+ unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
- details) < 0)
- goto restart;
+ details);
}
}
@@ -1736,26 +2344,18 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
-restart:
- list_for_each_entry(vma, head, shared.vm_set.list) {
- /* Skip quickly over those we have already dealt with */
- if (vma->vm_truncate_count == details->truncate_count)
- continue;
+ list_for_each_entry(vma, head, shared.nonlinear) {
details->nonlinear_vma = vma;
- if (unmap_mapping_range_vma(vma, vma->vm_start,
- vma->vm_end, details) < 0)
- goto restart;
+ unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
}
}
/**
- * unmap_mapping_range - unmap the portion of all mmaps
- * in the specified address_space corresponding to the specified
- * page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
- * boundary. Note that this is different from vmtruncate(), which
+ * boundary. Note that this is different from truncate_pagecache(), which
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
@@ -1785,169 +2385,17 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
- details.i_mmap_lock = &mapping->i_mmap_lock;
-
- spin_lock(&mapping->i_mmap_lock);
- /* serialize i_size write against truncate_count write */
- smp_wmb();
- /* Protect against page faults, and endless unmapping loops */
- mapping->truncate_count++;
- /*
- * For archs where spin_lock has inclusive semantics like ia64
- * this smp_mb() will prevent to read pagetable contents
- * before the truncate_count increment is visible to
- * other cpus.
- */
- smp_mb();
- if (unlikely(is_restart_addr(mapping->truncate_count))) {
- if (mapping->truncate_count == 0)
- reset_vma_truncate_counts(mapping);
- mapping->truncate_count++;
- }
- details.truncate_count = mapping->truncate_count;
- if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+ mutex_lock(&mapping->i_mmap_mutex);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
-/**
- * vmtruncate - unmap mappings "freed" by truncate() syscall
- * @inode: inode of the file used
- * @offset: file offset to start truncating
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
- */
-int vmtruncate(struct inode * inode, loff_t offset)
-{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
-
- if (inode->i_size < offset)
- goto do_expand;
- /*
- * truncation of in-use swapfiles is disallowed - it would cause
- * subsequent swapout to scribble on the now-freed blocks.
- */
- if (IS_SWAPFILE(inode))
- goto out_busy;
- i_size_write(inode, offset);
- unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, offset);
- goto out_truncate;
-
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out_big;
- i_size_write(inode, offset);
-
-out_truncate:
- if (inode->i_op && inode->i_op->truncate)
- inode->i_op->truncate(inode);
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out_big:
- return -EFBIG;
-out_busy:
- return -ETXTBSY;
-}
-EXPORT_SYMBOL(vmtruncate);
-
-int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
-{
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * If the underlying filesystem is not going to provide
- * a way to truncate a range of blocks (punch a hole) -
- * we should return failure right now.
- */
- if (!inode->i_op || !inode->i_op->truncate_range)
- return -ENOSYS;
-
- mutex_lock(&inode->i_mutex);
- down_write(&inode->i_alloc_sem);
- unmap_mapping_range(mapping, offset, (end - offset), 1);
- truncate_inode_pages_range(mapping, offset, end);
- inode->i_op->truncate_range(inode, offset, end);
- up_write(&inode->i_alloc_sem);
- mutex_unlock(&inode->i_mutex);
-
- return 0;
-}
-EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */
-
-/**
- * swapin_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
- * @addr: address to start
- * @vma: user vma this addresses belong to
- *
- * Primitive swap readahead code. We simply read an aligned block of
- * (1 << page_cluster) entries in the swap area. This method is chosen
- * because it doesn't cost us any seek time. We also make sure to queue
- * the 'original' request together with the readahead ones...
- *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
- */
-void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
-{
-#ifdef CONFIG_NUMA
- struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
-#endif
- int i, num;
- struct page *new_page;
- unsigned long offset;
-
- /*
- * Get the number of handles we should do readahead io to.
- */
- num = valid_swaphandles(entry, &offset);
- for (i = 0; i < num; offset++, i++) {
- /* Ok, do the async read-ahead now */
- new_page = read_swap_cache_async(swp_entry(swp_type(entry),
- offset), vma, addr);
- if (!new_page)
- break;
- page_cache_release(new_page);
-#ifdef CONFIG_NUMA
- /*
- * Find the next applicable VMA for the NUMA policy.
- */
- addr += PAGE_SIZE;
- if (addr == 0)
- vma = NULL;
- if (vma) {
- if (addr >= vma->vm_end) {
- vma = next_vma;
- next_vma = vma ? vma->vm_next : NULL;
- }
- if (vma && addr < vma->vm_start)
- vma = NULL;
- } else {
- if (next_vma && addr >= next_vma->vm_start) {
- vma = next_vma;
- next_vma = vma->vm_next;
- }
- }
-#endif
- }
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
-
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -1955,27 +2403,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page, *swapcache;
swp_entry_t entry;
pte_t pte;
- int ret = VM_FAULT_MINOR;
+ int locked;
+ struct mem_cgroup *ptr;
+ int exclusive = 0;
+ int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
goto out;
entry = pte_to_swp_entry(orig_pte);
- if (is_migration_entry(entry)) {
- migration_entry_wait(mm, pmd, address);
+ if (unlikely(non_swap_entry(entry))) {
+ if (is_migration_entry(entry)) {
+ migration_entry_wait(mm, pmd, address);
+ } else if (is_hwpoison_entry(entry)) {
+ ret = VM_FAULT_HWPOISON;
+ } else {
+ print_bad_pte(vma, address, orig_pte, NULL);
+ ret = VM_FAULT_SIGBUS;
+ }
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- swapin_readahead(entry, address, vma);
- page = read_swap_cache_async(entry, vma, address);
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -1991,12 +2449,47 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
- grab_swap_token();
+ mem_cgroup_count_vm_event(mm, PGMAJFAULT);
+ } else if (PageHWPoison(page)) {
+ /*
+ * hwpoisoned dirty swapcache pages are kept for killing
+ * owner processes (which may be unknown at hwpoison time)
+ */
+ ret = VM_FAULT_HWPOISON;
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ swapcache = page;
+ goto out_release;
}
+ swapcache = page;
+ locked = lock_page_or_retry(page, mm, flags);
+
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- mark_page_accessed(page);
- lock_page(page);
+ if (!locked) {
+ ret |= VM_FAULT_RETRY;
+ goto out_release;
+ }
+
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
+ */
+ if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+ goto out_page;
+
+ page = ksm_might_need_to_copy(page, vma, address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
+ ret = VM_FAULT_OOM;
+ goto out_page;
+ }
/*
* Back out if somebody else already faulted in this pte.
@@ -2010,299 +2503,536 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_nomap;
}
- /* The page isn't present yet, go ahead with the fault. */
+ /*
+ * The page isn't present yet, go ahead with the fault.
+ *
+ * Be careful about the sequence of operations here.
+ * To get its accounting right, reuse_swap_page() must be called
+ * while the page is counted on swap but not yet in mapcount i.e.
+ * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+ * must be called after the swap_free(), or it will never succeed.
+ * Because delete_from_swap_cache() may be called by reuse_swap_page(),
+ * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
+ * in page->private. In this case, a record in swap_cgroup is silently
+ * discarded at swap_free().
+ */
- inc_mm_counter(mm, anon_rss);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if (write_access && can_share_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- write_access = 0;
+ flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = 1;
}
-
flush_icache_page(vma, page);
+ if (pte_swp_soft_dirty(orig_pte))
+ pte = pte_mksoft_dirty(pte);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ if (page == swapcache)
+ do_page_add_anon_rmap(page, vma, address, exclusive);
+ else /* ksm created a completely new copy */
+ page_add_new_anon_rmap(page, vma, address);
+ /* It's better to call commit-charge after rmap is established */
+ mem_cgroup_commit_charge_swapin(page, ptr);
swap_free(entry);
- if (vm_swap_full())
- remove_exclusive_swap_page(page);
+ if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ try_to_free_swap(page);
unlock_page(page);
+ if (page != swapcache) {
+ /*
+ * Hold the lock to prevent the swap entry from being reused
+ * until we take the PT lock for the pte_same() check
+ * (to avoid false positives from pte_same). For
+ * further safety release the lock after the swap_free
+ * so that the swap count won't change under a
+ * parallel locked swapcache.
+ */
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
- if (write_access) {
- if (do_wp_page(mm, vma, address,
- page_table, pmd, ptl, pte) == VM_FAULT_OOM)
- ret = VM_FAULT_OOM;
+ if (flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ if (ret & VM_FAULT_ERROR)
+ ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
- lazy_mmu_prot_update(pte);
+ update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
out:
return ret;
out_nomap:
+ mem_cgroup_cancel_charge_swapin(ptr);
pte_unmap_unlock(page_table, ptl);
+out_page:
unlock_page(page);
+out_release:
page_cache_release(page);
+ if (page != swapcache) {
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
return ret;
}
/*
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
+ * doesn't hit another vma.
+ */
+static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
+{
+ address &= PAGE_MASK;
+ if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
+ struct vm_area_struct *prev = vma->vm_prev;
+
+ /*
+ * Is there a mapping abutting this one below?
+ *
+ * That's only ok if it's the same stack mapping
+ * that has gotten split..
+ */
+ if (prev && prev->vm_end == address)
+ return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
+
+ expand_downwards(vma, address - PAGE_SIZE);
+ }
+ if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+ struct vm_area_struct *next = vma->vm_next;
+
+ /* As VM_GROWSDOWN but s/below/above/ */
+ if (next && next->vm_start == address + PAGE_SIZE)
+ return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+ expand_upwards(vma, address + PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
+ unsigned int flags)
{
struct page *page;
spinlock_t *ptl;
pte_t entry;
- if (write_access) {
- /* Allocate our own private page. */
- pte_unmap(page_table);
-
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
- page = alloc_zeroed_user_highpage(vma, address);
- if (!page)
- goto oom;
+ pte_unmap(page_table);
- entry = mk_pte(page, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ /* Check if we need to add a guard page to the stack */
+ if (check_stack_guard_page(vma, address) < 0)
+ return VM_FAULT_SIGBUS;
+ /* Use the zero-page for reads */
+ if (!(flags & FAULT_FLAG_WRITE)) {
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
+ vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
- goto release;
- inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(page);
- page_add_new_anon_rmap(page, vma, address);
- } else {
- /* Map the ZERO_PAGE - vm_page_prot is readonly */
- page = ZERO_PAGE(address);
- page_cache_get(page);
- entry = mk_pte(page, vma->vm_page_prot);
-
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- if (!pte_none(*page_table))
- goto release;
- inc_mm_counter(mm, file_rss);
- page_add_file_rmap(page);
+ goto unlock;
+ goto setpte;
}
+ /* Allocate our own private page. */
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ page = alloc_zeroed_user_highpage_movable(vma, address);
+ if (!page)
+ goto oom;
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
+ goto oom_free_page;
+
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_none(*page_table))
+ goto release;
+
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+setpte:
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
- return VM_FAULT_MINOR;
+ return 0;
release:
+ mem_cgroup_uncharge_page(page);
page_cache_release(page);
goto unlock;
+oom_free_page:
+ page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
-/*
- * do_no_page() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+static int __do_fault(struct vm_area_struct *vma, unsigned long address,
+ pgoff_t pgoff, unsigned int flags, struct page **page)
+{
+ struct vm_fault vmf;
+ int ret;
+
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+ vmf.page = NULL;
+
+ ret = vma->vm_ops->fault(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ if (unlikely(PageHWPoison(vmf.page))) {
+ if (ret & VM_FAULT_LOCKED)
+ unlock_page(vmf.page);
+ page_cache_release(vmf.page);
+ return VM_FAULT_HWPOISON;
+ }
+
+ if (unlikely(!(ret & VM_FAULT_LOCKED)))
+ lock_page(vmf.page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+
+ *page = vmf.page;
+ return ret;
+}
+
+/**
+ * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
*
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true, if new entry is writable
+ * @anon: true, if it's anonymous page
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * Caller must hold page table lock relevant for @pte.
+ *
+ * Target users are page handler itself and implementations of
+ * vm_ops->map_pages.
*/
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+ struct page *page, pte_t *pte, bool write, bool anon)
{
- spinlock_t *ptl;
- struct page *new_page;
- struct address_space *mapping = NULL;
pte_t entry;
- unsigned int sequence = 0;
- int ret = VM_FAULT_MINOR;
- int anon = 0;
- struct page *dirty_page = NULL;
- pte_unmap(page_table);
- BUG_ON(vma->vm_flags & VM_PFNMAP);
-
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- sequence = mapping->truncate_count;
- smp_rmb(); /* serializes i_size against truncate_count */
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (write)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
+ pte_mksoft_dirty(entry);
+ if (anon) {
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+ page_add_file_rmap(page);
}
-retry:
- new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
- /*
- * No smp_rmb is needed here as long as there's a full
- * spin_lock/unlock sequence inside the ->nopage callback
- * (for the pagecache lookup) that acts as an implicit
- * smp_mb() and prevents the i_size read to happen
- * after the next truncate_count read.
- */
+ set_pte_at(vma->vm_mm, address, pte, entry);
- /* no page was available -- either SIGBUS or OOM */
- if (new_page == NOPAGE_SIGBUS)
- return VM_FAULT_SIGBUS;
- if (new_page == NOPAGE_OOM)
- return VM_FAULT_OOM;
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, pte);
+}
- /*
- * Should we do an early C-O-W break?
- */
- if (write_access) {
- if (!(vma->vm_flags & VM_SHARED)) {
- struct page *page;
+static unsigned long fault_around_bytes = rounddown_pow_of_two(65536);
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!page)
- goto oom;
- copy_user_highpage(page, new_page, address);
- page_cache_release(new_page);
- new_page = page;
- anon = 1;
+static inline unsigned long fault_around_pages(void)
+{
+ return fault_around_bytes >> PAGE_SHIFT;
+}
- } else {
- /* if the page will be shareable, see if the backing
- * address space wants to know that the page is about
- * to become writable */
- if (vma->vm_ops->page_mkwrite &&
- vma->vm_ops->page_mkwrite(vma, new_page) < 0
- ) {
- page_cache_release(new_page);
- return VM_FAULT_SIGBUS;
- }
- }
- }
+static inline unsigned long fault_around_mask(void)
+{
+ return ~(fault_around_bytes - 1) & PAGE_MASK;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int fault_around_bytes_get(void *data, u64 *val)
+{
+ *val = fault_around_bytes;
+ return 0;
+}
+
+/*
+ * fault_around_pages() and fault_around_mask() expect fault_around_bytes
+ * rounded down to nearest page order. It's what do_fault_around() expects to
+ * see.
+ */
+static int fault_around_bytes_set(void *data, u64 val)
+{
+ if (val / PAGE_SIZE > PTRS_PER_PTE)
+ return -EINVAL;
+ if (val > PAGE_SIZE)
+ fault_around_bytes = rounddown_pow_of_two(val);
+ else
+ fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
+ fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
+ if (!ret)
+ pr_warn("Failed to create fault_around_bytes in debugfs");
+ return 0;
+}
+late_initcall(fault_around_debugfs);
+#endif
+
+/*
+ * do_fault_around() tries to map a few pages around the fault address. The hope
+ * is that the pages will be needed soon and this will lower the number of
+ * faults to handle.
+ *
+ * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
+ * not ready to be mapped: not up-to-date, locked, etc.
+ *
+ * This function is called with the page table lock taken. In the split ptlock
+ * case the page table lock protects only those entries which belong to
+ * the page table corresponding to the fault address.
+ *
+ * This function doesn't cross the VMA boundaries, in order to call map_pages()
+ * only once.
+ *
+ * fault_around_pages() defines how many pages we'll try to map.
+ * do_fault_around() expects it to return a power of two less than or equal to
+ * PTRS_PER_PTE.
+ *
+ * The virtual address of the area that we map is naturally aligned to the
+ * fault_around_pages() value (and therefore to page order). This way it's
+ * easier to guarantee that we don't cross page table boundaries.
+ */
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+ unsigned long start_addr;
+ pgoff_t max_pgoff;
+ struct vm_fault vmf;
+ int off;
+
+ start_addr = max(address & fault_around_mask(), vma->vm_start);
+ off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ pte -= off;
+ pgoff -= off;
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
- * For a file-backed vma, someone could have truncated or otherwise
- * invalidated this page. If unmap_mapping_range got called,
- * retry getting the page.
+ * max_pgoff is either end of page table or end of vma
+ * or fault_around_pages() from pgoff, depending what is nearest.
*/
- if (mapping && unlikely(sequence != mapping->truncate_count)) {
- pte_unmap_unlock(page_table, ptl);
- page_cache_release(new_page);
- cond_resched();
- sequence = mapping->truncate_count;
- smp_rmb();
- goto retry;
+ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ PTRS_PER_PTE - 1;
+ max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+ pgoff + fault_around_pages() - 1);
+
+ /* Check if it makes any sense to call ->map_pages */
+ while (!pte_none(*pte)) {
+ if (++pgoff > max_pgoff)
+ return;
+ start_addr += PAGE_SIZE;
+ if (start_addr >= vma->vm_end)
+ return;
+ pte++;
}
+ vmf.virtual_address = (void __user *) start_addr;
+ vmf.pte = pte;
+ vmf.pgoff = pgoff;
+ vmf.max_pgoff = max_pgoff;
+ vmf.flags = flags;
+ vma->vm_ops->map_pages(vma, &vmf);
+}
+
+static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ struct page *fault_page;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret = 0;
+
/*
- * This silly early PAGE_DIRTY setting removes a race
- * due to the bad i386 page protection. But it's valid
- * for other architectures too.
- *
- * Note that if write_access is true, we either now have
- * an exclusive copy of the page, or this is a shared mapping,
- * so we can make it writable and dirty to avoid having to
- * handle that later.
+ * Let's call ->map_pages() first and use ->fault() as fallback
+ * if page by the offset is not ready to be mapped (cold cache or
+ * something).
*/
- /* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- flush_icache_page(vma, new_page);
- entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte_at(mm, address, page_table, entry);
- if (anon) {
- inc_mm_counter(mm, anon_rss);
- lru_cache_add_active(new_page);
- page_add_new_anon_rmap(new_page, vma, address);
- } else {
- inc_mm_counter(mm, file_rss);
- page_add_file_rmap(new_page);
- if (write_access) {
- dirty_page = new_page;
- get_page(dirty_page);
- }
- }
- } else {
- /* One of our sibling threads was faster, back out. */
+ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
+ fault_around_pages() > 1) {
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ do_fault_around(vma, address, pte, pgoff, flags);
+ if (!pte_same(*pte, orig_pte))
+ goto unlock_out;
+ pte_unmap_unlock(pte, ptl);
+ }
+
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ return ret;
+ }
+ do_set_pte(vma, address, fault_page, pte, false, false);
+ unlock_page(fault_page);
+unlock_out:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ struct page *fault_page, *new_page;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!new_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
page_cache_release(new_page);
- goto unlock;
+ return VM_FAULT_OOM;
}
- /* no need to invalidate: a not-present page shouldn't be cached */
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
-unlock:
- pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
- set_page_dirty_balance(dirty_page);
- put_page(dirty_page);
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ goto uncharge_out;
+
+ copy_user_highpage(new_page, fault_page, address, vma);
+ __SetPageUptodate(new_page);
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ goto uncharge_out;
}
+ do_set_pte(vma, address, new_page, pte, true, true);
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
return ret;
-oom:
+uncharge_out:
+ mem_cgroup_uncharge_page(new_page);
page_cache_release(new_page);
- return VM_FAULT_OOM;
+ return ret;
}
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
+ struct page *fault_page;
+ struct address_space *mapping;
spinlock_t *ptl;
- pte_t entry;
- unsigned long pfn;
- int ret = VM_FAULT_MINOR;
+ pte_t *pte;
+ int dirtied = 0;
+ int ret, tmp;
- pte_unmap(page_table);
- BUG_ON(!(vma->vm_flags & VM_PFNMAP));
- BUG_ON(is_cow_mapping(vma->vm_flags));
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
- pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
- if (pfn == NOPFN_OOM)
- return VM_FAULT_OOM;
- if (pfn == NOPFN_SIGBUS)
- return VM_FAULT_SIGBUS;
+ /*
+ * Check if the backing address space wants to know that the page is
+ * about to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ unlock_page(fault_page);
+ tmp = do_page_mkwrite(vma, fault_page, address);
+ if (unlikely(!tmp ||
+ (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(fault_page);
+ return tmp;
+ }
+ }
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ return ret;
+ }
+ do_set_pte(vma, address, fault_page, pte, true, false);
+ pte_unmap_unlock(pte, ptl);
- /* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- entry = pfn_pte(pfn, vma->vm_page_prot);
- if (write_access)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte_at(mm, address, page_table, entry);
+ if (set_page_dirty(fault_page))
+ dirtied = 1;
+ mapping = fault_page->mapping;
+ unlock_page(fault_page);
+ if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping but still
+ * dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- pte_unmap_unlock(page_table, ptl);
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+ file_update_time(vma->vm_file);
+
return ret;
}
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ unsigned int flags, pte_t orig_pte)
+{
+ pgoff_t pgoff = (((address & PAGE_MASK)
+ - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ pte_unmap(page_table);
+ if (!(flags & FAULT_FLAG_WRITE))
+ return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ if (!(vma->vm_flags & VM_SHARED))
+ return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+}
+
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
@@ -2312,33 +3042,123 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff;
- int err;
+
+ flags |= FAULT_FLAG_NONLINEAR;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return VM_FAULT_MINOR;
+ return 0;
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
- print_bad_pte(vma, orig_pte, address);
- return VM_FAULT_OOM;
+ print_bad_pte(vma, address, orig_pte, NULL);
+ return VM_FAULT_SIGBUS;
}
- /* We can then assume vm->vm_ops && vma->vm_ops->populate */
pgoff = pte_to_pgoff(orig_pte);
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
- vma->vm_page_prot, pgoff, 0);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_MAJOR;
+ if (!(flags & FAULT_FLAG_WRITE))
+ return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ if (!(vma->vm_flags & VM_SHARED))
+ return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+}
+
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid,
+ int *flags)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (page_nid == numa_node_id()) {
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ *flags |= TNF_FAULT_LOCAL;
+ }
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ int page_nid = -1;
+ int last_cpupid;
+ int target_nid;
+ bool migrated = false;
+ int flags = 0;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ *
+ * ptep_modify_prot_start is not called as this is clearing
+ * the _PAGE_NUMA bit and it is not really expected that there
+ * would be concurrent hardware modifications to the PTE.
+ */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*ptep, pte))) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ pte = pte_mknonnuma(pte);
+ set_pte_at(mm, addr, ptep, pte);
+ update_mmu_cache(vma, addr, ptep);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+ BUG_ON(is_zero_pfn(page_to_pfn(page)));
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pte_write(pte))
+ flags |= TNF_NO_GROUP;
+
+ /*
+ * Flag if the page is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ flags |= TNF_SHARED;
+
+ last_cpupid = page_cpupid_last(page);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, vma, target_nid);
+ if (migrated) {
+ page_nid = target_nid;
+ flags |= TNF_MIGRATED;
+ }
+
+out:
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, 1, flags);
+ return 0;
}
/*
@@ -2354,51 +3174,47 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, int write_access)
+static int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
- pte_t old_entry;
spinlock_t *ptl;
- old_entry = entry = *pte;
+ entry = *pte;
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
- if (vma->vm_ops->nopage)
- return do_no_page(mm, vma, address,
- pte, pmd,
- write_access);
- if (unlikely(vma->vm_ops->nopfn))
- return do_no_pfn(mm, vma, address, pte,
- pmd, write_access);
+ if (likely(vma->vm_ops->fault))
+ return do_linear_fault(mm, vma, address,
+ pte, pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
+ pte, pmd, flags);
}
if (pte_file(entry))
- return do_file_page(mm, vma, address,
- pte, pmd, write_access, entry);
+ return do_nonlinear_fault(mm, vma, address,
+ pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
- pte, pmd, write_access, entry);
+ pte, pmd, flags, entry);
}
+ if (pte_numa(entry))
+ return do_numa_page(mm, vma, address, entry, pte, pmd);
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
- if (write_access) {
+ if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (!pte_same(old_entry, entry)) {
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -2406,31 +3222,27 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (write_access)
- flush_tlb_page(vma, address);
+ if (flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
- return VM_FAULT_MINOR;
+ return 0;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- __set_current_state(TASK_RUNNING);
-
- count_vm_event(PGFAULT);
-
if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
@@ -2439,14 +3251,104 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- pte = pte_alloc_map(mm, pmd, address);
- if (!pte)
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = VM_FAULT_FALLBACK;
+ if (!vma->vm_ops)
+ ret = do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ pmd_t orig_pmd = *pmd;
+ int ret;
+
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ /*
+ * If the pmd is splitting, return and retry the
+ * fault. Alternative: wait until the split
+ * is done, and goto retry.
+ */
+ if (pmd_trans_splitting(orig_pmd))
+ return 0;
+
+ if (pmd_numa(orig_pmd))
+ return do_huge_pmd_numa_page(mm, vma, address,
+ orig_pmd, pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
+ ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+ orig_pmd);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ huge_pmd_set_accessed(mm, vma, address, pmd,
+ orig_pmd, dirty);
+ return 0;
+ }
+ }
+ }
+
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if a huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
+ /* if a huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * in read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
- return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
+{
+ int ret;
+
+ __set_current_state(TASK_RUNNING);
+
+ count_vm_event(PGFAULT);
+ mem_cgroup_count_vm_event(mm, PGFAULT);
+
+ /* do counter updates before entering really critical section. */
+ check_sync_rss_stat(current);
+
+ /*
+ * Enable the memcg OOM handling for faults triggered in user
+ * space. Kernel faults are handled more gracefully.
+ */
+ if (flags & FAULT_FLAG_USER)
+ mem_cgroup_oom_enable();
+
+ ret = __handle_mm_fault(mm, vma, address, flags);
+
+ if (flags & FAULT_FLAG_USER) {
+ mem_cgroup_oom_disable();
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ }
+
+ return ret;
+}
#ifndef __PAGETABLE_PUD_FOLDED
/*
@@ -2459,20 +3361,16 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
- pud_free(new);
+ pud_free(mm, new);
else
pgd_populate(mm, pgd, new);
spin_unlock(&mm->page_table_lock);
return 0;
}
-#else
-/* Workaround for gcc 2.96 */
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
-{
- return 0;
-}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
@@ -2486,88 +3384,25 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
+ smp_wmb(); /* See comment in __pte_alloc */
+
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) /* Another has populated it */
- pmd_free(new);
+ pmd_free(mm, new);
else
pud_populate(mm, pud, new);
#else
if (pgd_present(*pud)) /* Another has populated it */
- pmd_free(new);
+ pmd_free(mm, new);
else
pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
}
-#else
-/* Workaround for gcc 2.96 */
-int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
-{
- return 0;
-}
#endif /* __PAGETABLE_PMD_FOLDED */
-int make_pages_present(unsigned long addr, unsigned long end)
-{
- int ret, len, write;
- struct vm_area_struct * vma;
-
- vma = find_vma(current->mm, addr);
- if (!vma)
- return -1;
- write = (vma->vm_flags & VM_WRITE) != 0;
- BUG_ON(addr >= end);
- BUG_ON(end > vma->vm_end);
- len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
- ret = get_user_pages(current, current->mm, addr,
- len, write, 0, NULL, NULL);
- if (ret < 0)
- return ret;
- return ret == len ? 0 : -1;
-}
-
-/*
- * Map a vmalloc()-space virtual address to the physical page.
- */
-struct page * vmalloc_to_page(void * vmalloc_addr)
-{
- unsigned long addr = (unsigned long) vmalloc_addr;
- struct page *page = NULL;
- pgd_t *pgd = pgd_offset_k(addr);
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep, pte;
-
- if (!pgd_none(*pgd)) {
- pud = pud_offset(pgd, addr);
- if (!pud_none(*pud)) {
- pmd = pmd_offset(pud, addr);
- if (!pmd_none(*pmd)) {
- ptep = pte_offset_map(pmd, addr);
- pte = *ptep;
- if (pte_present(pte))
- page = pte_page(pte);
- pte_unmap(ptep);
- }
- }
- }
- return page;
-}
-
-EXPORT_SYMBOL(vmalloc_to_page);
-
-/*
- * Map a vmalloc()-space virtual address to the physical page frame number.
- */
-unsigned long vmalloc_to_pfn(void * vmalloc_addr)
-{
- return page_to_pfn(vmalloc_to_page(vmalloc_addr));
-}
-
-EXPORT_SYMBOL(vmalloc_to_pfn);
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
@@ -2578,14 +3413,15 @@ static int __init gate_vma_init(void)
gate_vma.vm_mm = NULL;
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
- gate_vma.vm_page_prot = PAGE_READONLY;
- gate_vma.vm_flags = 0;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+
return 0;
}
__initcall(gate_vma_init);
#endif
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
@@ -2594,7 +3430,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
#endif
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -2605,55 +3441,392 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
+static int __follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out;
+
+ /* We cannot handle huge page PFN maps. Luckily they don't exist. */
+ if (pmd_huge(*pmd))
+ goto out;
+
+ ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+ if (!ptep)
+ goto out;
+ if (!pte_present(*ptep))
+ goto unlock;
+ *ptepp = ptep;
+ return 0;
+unlock:
+ pte_unmap_unlock(ptep, *ptlp);
+out:
+ return -EINVAL;
+}
+
+static inline int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ return res;
+}
+
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only IO mappings and raw PFN mappings are allowed.
+ *
+ * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+ unsigned long *pfn)
+{
+ int ret = -EINVAL;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return ret;
+
+ ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+ if (ret)
+ return ret;
+ *pfn = pte_pfn(*ptep);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+int follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot, resource_size_t *phys)
+{
+ int ret = -EINVAL;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
+
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ goto out;
+ pte = *ptep;
+
+ if ((flags & FOLL_WRITE) && !pte_write(pte))
+ goto unlock;
+
+ *prot = pgprot_val(pte_pgprot(pte));
+ *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ ret = 0;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+out:
+ return ret;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ resource_size_t phys_addr;
+ unsigned long prot = 0;
+ void __iomem *maddr;
+ int offset = addr & (PAGE_SIZE-1);
+
+ if (follow_phys(vma, addr, write, &prot, &phys_addr))
+ return -EINVAL;
+
+ maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+ if (write)
+ memcpy_toio(maddr + offset, buf, len);
+ else
+ memcpy_fromio(buf, maddr + offset, len);
+ iounmap(maddr);
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(generic_access_phys);
+#endif
+
/*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
+ * Access another process' address space as given in mm. If non-NULL, use the
+ * given task for page fault accounting.
*/
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
{
- struct mm_struct *mm;
struct vm_area_struct *vma;
- struct page *page;
void *old_buf = buf;
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
down_read(&mm->mmap_sem);
- /* ignore errors, just check how much was sucessfully transfered */
+ /* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
+ struct page *page = NULL;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
- if (ret <= 0)
- break;
-
- bytes = len;
- offset = addr & (PAGE_SIZE-1);
- if (bytes > PAGE_SIZE-offset)
- bytes = PAGE_SIZE-offset;
-
- maddr = kmap(page);
- if (write) {
- copy_to_user_page(vma, page, addr,
- maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ if (ret <= 0) {
+ /*
+ * Check if this is a VM_IO | VM_PFNMAP VMA, which
+ * we can access using slightly different code.
+ */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr)
+ break;
+ if (vma->vm_ops && vma->vm_ops->access)
+ ret = vma->vm_ops->access(vma, addr, buf,
+ len, write);
+ if (ret <= 0)
+#endif
+ break;
+ bytes = ret;
} else {
- copy_from_user_page(vma, page, addr,
- buf, maddr + offset, bytes);
+ bytes = len;
+ offset = addr & (PAGE_SIZE-1);
+ if (bytes > PAGE_SIZE-offset)
+ bytes = PAGE_SIZE-offset;
+
+ maddr = kmap(page);
+ if (write) {
+ copy_to_user_page(vma, page, addr,
+ maddr + offset, buf, bytes);
+ set_page_dirty_lock(page);
+ } else {
+ copy_from_user_page(vma, page, addr,
+ buf, maddr + offset, bytes);
+ }
+ kunmap(page);
+ page_cache_release(page);
}
- kunmap(page);
- page_cache_release(page);
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
- mmput(mm);
return buf - old_buf;
}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+ mmput(mm);
+
+ return ret;
+}
+
+/*
+ * Print the name of a VMA.
+ */
+void print_vma_addr(char *prefix, unsigned long ip)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ /*
+ * Do not print if we are in atomic
+ * contexts (in exception stacks, etc.):
+ */
+ if (preempt_count())
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, ip);
+ if (vma && vma->vm_file) {
+ struct file *f = vma->vm_file;
+ char *buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf) {
+ char *p;
+
+ p = d_path(&f->f_path, buf, PAGE_SIZE);
+ if (IS_ERR(p))
+ p = "?";
+ printk("%s%s[%lx+%lx]", prefix, kbasename(p),
+ vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ free_page((unsigned long)buf);
+ }
+ }
+ up_read(&mm->mmap_sem);
+}
+
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
+void might_fault(void)
+{
+ /*
+ * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+ * holding the mmap_sem, this is safe because kernel memory doesn't
+ * get paged out, therefore we'll never actually fault, and the
+ * below annotations will generate false positives.
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ return;
+
+ /*
+ * it would be nicer only to annotate paths which are not under
+ * pagefault_disable, however that requires a larger audit and
+ * providing helpers like get_user_atomic.
+ */
+ if (in_atomic())
+ return;
+
+ __might_sleep(__FILE__, __LINE__, 0);
+
+ if (current->mm)
+ might_lock_read(&current->mm->mmap_sem);
+}
+EXPORT_SYMBOL(might_fault);
+#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+
+static struct kmem_cache *page_ptl_cachep;
+
+void __init ptlock_cache_init(void)
+{
+ page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+ SLAB_PANIC, NULL);
+}
+
+bool ptlock_alloc(struct page *page)
+{
+ spinlock_t *ptl;
+
+ ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
+ if (!ptl)
+ return false;
+ page->ptl = ptl;
+ return true;
+}
+
+void ptlock_free(struct page *page)
+{
+ kmem_cache_free(page_ptl_cachep, page->ptl);
+}
+#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b7..469bbf505f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,10 +9,10 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/compiler.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagevec.h>
+#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
@@ -21,41 +21,457 @@
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/mm_inline.h>
+#include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
+#include <linux/hugetlb.h>
+#include <linux/memblock.h>
#include <asm/tlbflush.h>
-extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
- unsigned long size);
-static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+#include "internal.h"
+
+/*
+ * online_page_callback contains pointer to current page onlining function.
+ * Initially it is generic_online_page(). If it is required it could be
+ * changed by calling set_online_page_callback() for callback registration
+ * and restore_online_page_callback() for generic callback restore.
+ */
+
+static void generic_online_page(struct page *page);
+
+static online_page_callback_t online_page_callback = generic_online_page;
+static DEFINE_MUTEX(online_page_callback_lock);
+
+/* The same as the cpu_hotplug lock, but for memory hotplug. */
+static struct {
+ struct task_struct *active_writer;
+ struct mutex lock; /* Synchronizes accesses to refcount, */
+ /*
+ * Also blocks the new readers during
+ * an ongoing mem hotplug operation.
+ */
+ int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} mem_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
+ .refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "mem_hotplug.lock" },
+#endif
+};
+
+/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
+#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
+#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
+#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+
+void get_online_mems(void)
+{
+ might_sleep();
+ if (mem_hotplug.active_writer == current)
+ return;
+ memhp_lock_acquire_read();
+ mutex_lock(&mem_hotplug.lock);
+ mem_hotplug.refcount++;
+ mutex_unlock(&mem_hotplug.lock);
+
+}
+
+void put_online_mems(void)
+{
+ if (mem_hotplug.active_writer == current)
+ return;
+ mutex_lock(&mem_hotplug.lock);
+
+ if (WARN_ON(!mem_hotplug.refcount))
+ mem_hotplug.refcount++; /* try to fix things up */
+
+ if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
+ wake_up_process(mem_hotplug.active_writer);
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+
+}
+
+static void mem_hotplug_begin(void)
+{
+ mem_hotplug.active_writer = current;
+
+ memhp_lock_acquire();
+ for (;;) {
+ mutex_lock(&mem_hotplug.lock);
+ if (likely(!mem_hotplug.refcount))
+ break;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&mem_hotplug.lock);
+ schedule();
+ }
+}
+
+static void mem_hotplug_done(void)
+{
+ mem_hotplug.active_writer = NULL;
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+}
+
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size)
+{
+ struct resource *res;
+ res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+ BUG_ON(!res);
+
+ res->name = "System RAM";
+ res->start = start;
+ res->end = start + size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ if (request_resource(&iomem_resource, res) < 0) {
+ pr_debug("System RAM resource %pR cannot be added\n", res);
+ kfree(res);
+ res = NULL;
+ }
+ return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+ if (!res)
+ return;
+ release_resource(res);
+ kfree(res);
+ return;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
+{
+ page->lru.next = (struct list_head *) type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ atomic_inc(&page->_count);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type;
+
+ type = (unsigned long) page->lru.next;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (atomic_dec_return(&page->_count) == 1) {
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ free_reserved_page(page);
+ }
+}
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long *usemap, mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usemap = __nr_to_section(section_nr)->pageblock_flags;
+ page = virt_to_page(usemap);
+
+ mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long *usemap, mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+
+ if (!pfn_valid(start_pfn))
+ return;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usemap = __nr_to_section(section_nr)->pageblock_flags;
+ page = virt_to_page(usemap);
+
+ mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+ struct zone *zone;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ zone = &pgdat->node_zones[0];
+ for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
+ if (zone_is_initialized(zone)) {
+ nr_pages = zone->wait_table_hash_nr_entries
+ * sizeof(wait_queue_head_t);
+ nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
+ page = virt_to_page(zone->wait_table);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+ }
+ }
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other nodes.
+ */
+ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
+
+static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long old_zone_end_pfn;
+
+ zone_span_writelock(zone);
+
+ old_zone_end_pfn = zone_end_pfn(zone);
+ if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
+ zone->zone_start_pfn;
+
+ zone_span_writeunlock(zone);
+}
+
+static void resize_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ zone_span_writelock(zone);
+
+ if (end_pfn - start_pfn) {
+ zone->zone_start_pfn = start_pfn;
+ zone->spanned_pages = end_pfn - start_pfn;
+ } else {
+ /*
+ * keep this consistent with free_area_init_core():
+ * if spanned_pages = 0, then keep start_pfn = 0
+ */
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
+
+ zone_span_writeunlock(zone);
+}
+
+static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ enum zone_type zid = zone_idx(zone);
+ int nid = zone->zone_pgdat->node_id;
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ set_page_links(pfn_to_page(pfn), zid, nid, pfn);
+}
+
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+ unsigned long start_pfn, unsigned long num_pages)
+{
+ if (!zone_is_initialized(zone))
+ return init_currently_empty_zone(zone, start_pfn, num_pages,
+ MEMMAP_HOTPLUG);
+ return 0;
+}
+
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z1_start_pfn;
+
+ ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+ if (ret)
+ return ret;
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are higher than @z2 */
+ if (end_pfn > zone_end_pfn(z2))
+ goto out_fail;
+ /* the moved-out part must be at the leftmost end of @z2 */
+ if (start_pfn > z2->zone_start_pfn)
+ goto out_fail;
+ /* the range must overlap @z2 */
+ if (end_pfn <= z2->zone_start_pfn)
+ goto out_fail;
+
+ /* use start_pfn for z1's start_pfn if z1 is empty */
+ if (!zone_is_empty(z1))
+ z1_start_pfn = z1->zone_start_pfn;
+ else
+ z1_start_pfn = start_pfn;
+
+ resize_zone(z1, z1_start_pfn, end_pfn);
+ resize_zone(z2, end_pfn, zone_end_pfn(z2));
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z1, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z2_end_pfn;
+
+ ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+ if (ret)
+ return ret;
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are lower than @z1 */
+ if (z1->zone_start_pfn > start_pfn)
+ goto out_fail;
+ /* the moved-out part must be at the rightmost end of @z1 */
+ if (zone_end_pfn(z1) > end_pfn)
+ goto out_fail;
+ /* the range must overlap @z1 */
+ if (start_pfn >= zone_end_pfn(z1))
+ goto out_fail;
+
+ /* use end_pfn for z2's end_pfn if z2 is empty */
+ if (!zone_is_empty(z2))
+ z2_end_pfn = zone_end_pfn(z2);
+ else
+ z2_end_pfn = end_pfn;
+
+ resize_zone(z1, z1->zone_start_pfn, start_pfn);
+ resize_zone(z2, start_pfn, z2_end_pfn);
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z2, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
+
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
+ pgdat->node_start_pfn;
+}
+
+static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
+ unsigned long flags;
+ int ret;
zone_type = zone - pgdat->node_zones;
- if (!populated_zone(zone)) {
- int ret = 0;
- ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
- if (ret < 0)
- return ret;
- }
- memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
- zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+ ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+ if (ret)
+ return ret;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
+ grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
+ phys_start_pfn + nr_pages);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ memmap_init_zone(nr_pages, nid, zone_type,
+ phys_start_pfn, MEMMAP_HOTPLUG);
return 0;
}
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
- int nr_pages);
-static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(int nid, struct zone *zone,
+ unsigned long phys_start_pfn)
{
- int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+ ret = sparse_add_one_section(zone, phys_start_pfn);
if (ret < 0)
return ret;
@@ -65,7 +481,7 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
if (ret < 0)
return ret;
- return register_new_memory(__pfn_to_section(phys_start_pfn));
+ return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
/*
@@ -74,8 +490,8 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages)
{
unsigned long i;
int err = 0;
@@ -85,10 +501,10 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
/*
- * EEXIST is finally dealed with by ioresource collision
+ * EEXIST is finally dealt with by ioresource collision
* check. see add_memory() => register_memory_resource()
* Warning will be printed if there is collision.
*/
@@ -101,116 +517,581 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__add_pages);
-static void grow_zone_span(struct zone *zone,
- unsigned long start_pfn, unsigned long end_pfn)
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Walk [start_pfn, end_pfn) in PAGES_PER_SECTION steps and return the first
+ * pfn whose section is valid, sits on node @nid and, when @zone is non-NULL,
+ * belongs to @zone.  Returns 0 when no such section exists.
+ *
+ * The result is a pfn, so the function returns unsigned long; an int return
+ * would truncate pfns that do not fit in an int.
+ */
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
+					       unsigned long start_pfn,
+					       unsigned long end_pfn)
+{
+	struct mem_section *ms;
+
+	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+		ms = __pfn_to_section(start_pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(start_pfn) != nid))
+			continue;
+
+		/* A NULL @zone means "any zone on the node". */
+		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+			continue;
+
+		return start_pfn;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk [start_pfn, end_pfn) downwards in PAGES_PER_SECTION steps, starting
+ * at end_pfn - 1, and return the first pfn whose section is valid, sits on
+ * node @nid and, when @zone is non-NULL, belongs to @zone.  Returns 0 when
+ * no such section exists.
+ *
+ * The result is a pfn, so the function returns unsigned long; an int return
+ * would truncate pfns that do not fit in an int.
+ *
+ * NOTE(review): pfn is unsigned, so once pfn drops below PAGES_PER_SECTION
+ * while still >= start_pfn the decrement wraps around instead of ending the
+ * loop - confirm callers never pass such a low @start_pfn.
+ */
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
+					      unsigned long start_pfn,
+					      unsigned long end_pfn)
+{
+	struct mem_section *ms;
+	unsigned long pfn;
+
+	/* pfn is the end pfn of a memory section. */
+	pfn = end_pfn - 1;
+	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+		ms = __pfn_to_section(pfn);
+
+		if (unlikely(!valid_section(ms)))
+			continue;
+
+		if (unlikely(pfn_to_nid(pfn) != nid))
+			continue;
+
+		/* A NULL @zone means "any zone on the node". */
+		if (zone && zone != page_zone(pfn_to_page(pfn)))
+			continue;
+
+		return pfn;
+	}
+
+	return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
+ unsigned long zone_end_pfn = z;
+ unsigned long pfn;
+ struct mem_section *ms;
+ int nid = zone_to_nid(zone);
zone_span_writelock(zone);
+ if (zone_start_pfn == start_pfn) {
+ /*
+ * If the section is the smallest section in the zone, we need
+ * to shrink zone->zone_start_pfn and zone->spanned_pages.
+ * In this case, we find the second smallest valid mem_section
+ * for shrinking the zone.
+ */
+ pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+ zone_end_pfn);
+ if (pfn) {
+ zone->zone_start_pfn = pfn;
+ zone->spanned_pages = zone_end_pfn - pfn;
+ }
+ } else if (zone_end_pfn == end_pfn) {
+ /*
+ * If the section is the biggest section in the zone, we need
+ * to shrink zone->spanned_pages.
+ * In this case, we find the second biggest valid mem_section for
+ * shrinking the zone.
+ */
+ pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ start_pfn);
+ if (pfn)
+ zone->spanned_pages = pfn - zone_start_pfn + 1;
+ }
- old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
- if (start_pfn < zone->zone_start_pfn)
- zone->zone_start_pfn = start_pfn;
+ /*
+ * The section is neither the biggest nor the smallest mem_section in
+ * the zone; it only creates a hole in the zone, so we need not change
+ * the zone. But the zone may now contain nothing but holes, so check
+ * whether the zone consists only of holes.
+ */
+ pfn = zone_start_pfn;
+ for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+ ms = __pfn_to_section(pfn);
- zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
- zone->zone_start_pfn;
+ if (unlikely(!valid_section(ms)))
+ continue;
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ continue;
+
+ /* If the section is current section, it continues the loop */
+ if (start_pfn == pfn)
+ continue;
+
+ /* If we find valid section, we have nothing to do */
+ zone_span_writeunlock(zone);
+ return;
+ }
+
+ /* The zone has no valid section */
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
-static void grow_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+ unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long old_pgdat_end_pfn =
- pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+ unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+ unsigned long pgdat_end_pfn = p;
+ unsigned long pfn;
+ struct mem_section *ms;
+ int nid = pgdat->node_id;
- if (start_pfn < pgdat->node_start_pfn)
- pgdat->node_start_pfn = start_pfn;
+ if (pgdat_start_pfn == start_pfn) {
+ /*
+ * If the section is the smallest section in the pgdat, we need
+ * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
+ * In this case, we find the second smallest valid mem_section
+ * for shrinking the pgdat.
+ */
+ pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+ pgdat_end_pfn);
+ if (pfn) {
+ pgdat->node_start_pfn = pfn;
+ pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+ }
+ } else if (pgdat_end_pfn == end_pfn) {
+ /*
+ * If the section is the biggest section in the pgdat, we need
+ * to shrink pgdat->node_spanned_pages.
+ * In this case, we find the second biggest valid mem_section for
+ * shrinking the pgdat.
+ */
+ pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+ start_pfn);
+ if (pfn)
+ pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+ }
- pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
- pgdat->node_start_pfn;
+ /*
+ * If the section is neither the biggest nor the smallest mem_section
+ * in the pgdat, it only creates a hole in the pgdat, so we need not
+ * change the pgdat.
+ * But the pgdat may now contain nothing but holes, so check whether
+ * the pgdat consists only of holes.
+ */
+ pfn = pgdat_start_pfn;
+ for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+ ms = __pfn_to_section(pfn);
+
+ if (unlikely(!valid_section(ms)))
+ continue;
+
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+
+ /* If the section is current section, it continues the loop */
+ if (start_pfn == pfn)
+ continue;
+
+ /* If we find valid section, we have nothing to do */
+ return;
+ }
+
+ /* The pgdat has no valid section */
+ pgdat->node_start_pfn = 0;
+ pgdat->node_spanned_pages = 0;
}
-int online_pages(unsigned long pfn, unsigned long nr_pages)
+/*
+ * Shrink the spans of @zone and of its node after the section that starts
+ * at @start_pfn (PAGES_PER_SECTION pages long) is taken out.  Both span
+ * updates are done while holding the node's resize lock.
+ */
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long end_pfn = start_pfn + PAGES_PER_SECTION;
+	unsigned long flags;
+
+	pgdat_resize_lock(pgdat, &flags);
+	shrink_zone_span(zone, start_pfn, end_pfn);
+	shrink_pgdat_span(pgdat, start_pfn, end_pfn);
+	pgdat_resize_unlock(pgdat, &flags);
+}
+
+/*
+ * Tear down one memory section of @zone: unregister it, shrink the zone and
+ * node spans, then release the section itself.
+ *
+ * Returns -EINVAL for an invalid section, the error reported by
+ * unregister_memory_section() if that fails, and 0 otherwise.
+ */
+static int __remove_section(struct zone *zone, struct mem_section *ms)
+{
+	int section_nr;
+	unsigned long section_pfn;
+	int err;
+
+	if (!valid_section(ms))
+		return -EINVAL;
+
+	err = unregister_memory_section(ms);
+	if (err)
+		return err;
+
+	section_nr = __section_nr(ms);
+	section_pfn = section_nr_to_pfn(section_nr);
+	__remove_zone(zone, section_pfn);
+
+	sparse_remove_one_section(zone, ms);
+
+	return 0;
+}
+
+/**
+ * __remove_pages() - remove sections of pages from a zone
+ * @zone: zone from which pages need to be removed
+ * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @nr_pages: number of pages to remove (must be multiple of section size)
+ *
+ * Generic helper function to remove section mappings and sysfs entries
+ * for the section of the memory we are removing. Caller needs to make
+ * sure that pages are marked reserved and zones are adjusted properly by
+ * calling offline_pages().
+ *
+ * A failure to release the iomem resource is only reported with pr_warn();
+ * section removal is still attempted.
+ *
+ * Return: the result of the last __remove_section() call (0 when every
+ * section was removed, otherwise the error that stopped the loop).  When
+ * @nr_pages is 0 no section is touched and the result of the resource
+ * release is returned.
+ */
+int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+		   unsigned long nr_pages)
+{
+	unsigned long i, nr_sections;
+	resource_size_t start, size;
+	int ret = 0;
+
+	/*
+	 * We can only remove entire sections
+	 */
+	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
+	BUG_ON(nr_pages % PAGES_PER_SECTION);
+
+	/*
+	 * Widen to resource_size_t before shifting/multiplying so the byte
+	 * values cannot overflow unsigned long when resource_size_t is wider.
+	 */
+	start = (resource_size_t)phys_start_pfn << PAGE_SHIFT;
+	size = (resource_size_t)nr_pages * PAGE_SIZE;
+	ret = release_mem_region_adjustable(&iomem_resource, start, size);
+	if (ret) {
+		resource_size_t endres = start + size - 1;
+
+		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+				&start, &endres, ret);
+	}
+
+	/* Keep the count in the same type as the loop index. */
+	nr_sections = nr_pages / PAGES_PER_SECTION;
+	for (i = 0; i < nr_sections; i++) {
+		unsigned long pfn = phys_start_pfn + i * PAGES_PER_SECTION;
+
+		ret = __remove_section(zone, __pfn_to_section(pfn));
+		if (ret)
+			break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__remove_pages);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/*
+ * Install @callback as the online-page callback.  This only succeeds while
+ * the generic callback is still in place.
+ *
+ * Returns 0 when the callback was installed, -EINVAL otherwise.
+ */
+int set_online_page_callback(online_page_callback_t callback)
+{
+	int rc;
+
+	get_online_mems();
+	mutex_lock(&online_page_callback_lock);
+
+	if (online_page_callback != generic_online_page) {
+		rc = -EINVAL;
+	} else {
+		online_page_callback = callback;
+		rc = 0;
+	}
+
+	mutex_unlock(&online_page_callback_lock);
+	put_online_mems();
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(set_online_page_callback);
+
+/*
+ * Put the generic online-page callback back.  This only succeeds when
+ * @callback is the callback currently installed.
+ *
+ * Returns 0 when the generic callback was restored, -EINVAL otherwise.
+ */
+int restore_online_page_callback(online_page_callback_t callback)
+{
+	int rc;
+
+	get_online_mems();
+	mutex_lock(&online_page_callback_lock);
+
+	if (online_page_callback != callback) {
+		rc = -EINVAL;
+	} else {
+		online_page_callback = generic_online_page;
+		rc = 0;
+	}
+
+	mutex_unlock(&online_page_callback_lock);
+	put_online_mems();
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(restore_online_page_callback);
+
+/* First step of onlining a page; currently there is nothing to do. */
+void __online_page_set_limits(struct page *page)
+{
+	/* intentionally empty */
+}
+EXPORT_SYMBOL_GPL(__online_page_set_limits);
+
+/* Account @page as one more managed page. */
+void __online_page_increment_counters(struct page *page)
+{
+	const long one_page = 1;
+
+	adjust_managed_page_count(page, one_page);
+}
+EXPORT_SYMBOL_GPL(__online_page_increment_counters);
+
+/* Last step of onlining a page: hand the reserved @page to the allocator. */
+void __online_page_free(struct page *page)
+{
+	__free_reserved_page(page);
+}
+EXPORT_SYMBOL_GPL(__online_page_free);
+
+/*
+ * Default online-page callback: run the three onlining steps in order
+ * (set limits, bump counters, free the page).
+ */
+static void generic_online_page(struct page *page)
+{
+	__online_page_set_limits(page);
+	__online_page_increment_counters(page);
+	__online_page_free(page);
+}
+
+static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
{
unsigned long i;
+ unsigned long onlined_pages = *(unsigned long *)arg;
+ struct page *page;
+ if (PageReserved(pfn_to_page(start_pfn)))
+ for (i = 0; i < nr_pages; i++) {
+ page = pfn_to_page(start_pfn + i);
+ (*online_page_callback)(page);
+ onlined_pages++;
+ }
+ *(unsigned long *)arg = onlined_pages;
+ return 0;
+}
+
+/*
+ * May high/movable memory of @zone's node be onlined?
+ *
+ * With CONFIG_MOVABLE_NODE a node without normal memory is allowed, so the
+ * answer is always yes.  Without it, every online node must have NORMAL
+ * memory, so the node has to be in node_states[N_NORMAL_MEMORY] already.
+ */
+static bool can_online_high_movable(struct zone *zone)
+{
+#ifdef CONFIG_MOVABLE_NODE
+	return true;
+#else /* CONFIG_MOVABLE_NODE */
+	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+#endif /* CONFIG_MOVABLE_NODE */
+}
+
+/*
+ * Check which node_states masks will change when memory of @zone is onlined
+ * and record the answer in @arg.  Each arg->status_change_nid* field is set
+ * to the zone's node id when the node will have to be added to the matching
+ * mask, or to -1 when that mask stays as it is.  Nothing in node_states is
+ * modified here, and @nr_pages is not used by this function.
+ */
+static void node_states_check_changes_online(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ int nid = zone_to_nid(zone);
+ enum zone_type zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * If the memory to be onlined is in a zone of 0...zone_last, and
+ * the node is not yet in node_states[N_NORMAL_MEMORY], we will
+ * need to set the node in node_states[N_NORMAL_MEMORY] after
+ * the memory is online.
+ */
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+ arg->status_change_nid_normal = nid;
+ else
+ arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+ arg->status_change_nid_high = nid;
+ else
+ arg->status_change_nid_high = -1;
+#else
+ /* Without CONFIG_HIGHMEM the high result simply mirrors the normal one. */
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * If the node doesn't have memory before online, we will need to
+ * set the node in node_states[N_MEMORY] after the memory
+ * is online.
+ */
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ else
+ arg->status_change_nid = -1;
+}
+
+/*
+ * Apply the node_states changes recorded in @arg to @node after memory has
+ * been onlined.  N_NORMAL_MEMORY and N_HIGH_MEMORY are set only when the
+ * matching status_change_nid_* field is non-negative; N_MEMORY is always
+ * set.
+ */
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+	const int normal_changes = arg->status_change_nid_normal >= 0;
+	const int high_changes = arg->status_change_nid_high >= 0;
+
+	if (normal_changes)
+		node_set_state(node, N_NORMAL_MEMORY);
+
+	if (high_changes)
+		node_set_state(node, N_HIGH_MEMORY);
+
+	node_set_state(node, N_MEMORY);
+}
+
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+{
unsigned long flags;
unsigned long onlined_pages = 0;
- struct resource res;
- u64 section_end;
- unsigned long start_pfn;
struct zone *zone;
int need_zonelists_rebuild = 0;
+ int nid;
+ int ret;
+ struct memory_notify arg;
+ mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
- * memory_block->state_sem.
+ * memory_block->state_mutex.
*/
zone = page_zone(pfn_to_page(pfn));
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- grow_zone_span(zone, pfn, pfn + nr_pages);
- grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ ret = -EINVAL;
+ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+ !can_online_high_movable(zone))
+ goto out;
+
+ if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
+ goto out;
+ }
+ if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
+ goto out;
+ }
+
+ /* Previous code may have changed the zone of the pfn range */
+ zone = page_zone(pfn_to_page(pfn));
+
+ arg.start_pfn = pfn;
+ arg.nr_pages = nr_pages;
+ node_states_check_changes_online(nr_pages, zone, &arg);
+
+ nid = pfn_to_nid(pfn);
+
+ ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ goto out;
+ }
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
- if (!populated_zone(zone))
+ mutex_lock(&zonelists_mutex);
+ if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
+ build_all_zonelists(NULL, zone);
+ }
- res.start = (u64)pfn << PAGE_SHIFT;
- res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
- res.flags = IORESOURCE_MEM; /* we just need system ram */
- section_end = res.end;
-
- while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
- start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
- nr_pages = (unsigned long)
- ((res.end + 1 - res.start) >> PAGE_SHIFT);
-
- if (PageReserved(pfn_to_page(start_pfn))) {
- /* this region's page is not onlined now */
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pfn_to_page(start_pfn + i);
- online_page(page);
- onlined_pages++;
- }
- }
-
- res.start = res.end + 1;
- res.end = section_end;
+ ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
+ online_pages_range);
+ if (ret) {
+ if (need_zonelists_rebuild)
+ zone_pcp_reset(zone);
+ mutex_unlock(&zonelists_mutex);
+ printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages)
+ << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ goto out;
}
+
zone->present_pages += onlined_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages += onlined_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ if (onlined_pages) {
+ node_states_set_node(zone_to_nid(zone), &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL, NULL);
+ else
+ zone_pcp_update(zone);
+ }
+
+ mutex_unlock(&zonelists_mutex);
- setup_per_zone_pages_min();
+ init_per_zone_wmark_min();
+
+ if (onlined_pages)
+ kswapd_run(zone_to_nid(zone));
- if (need_zonelists_rebuild)
- build_all_zonelists();
vm_total_pages = nr_free_pagecache_pages();
- return 0;
+
+ writeback_set_ratelimit();
+
+ if (onlined_pages)
+ memory_notify(MEM_ONLINE, &arg);
+out:
+ mem_hotplug_done();
+ return ret;
}
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
unsigned long zholes_size[MAX_NR_ZONES] = {0};
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = PFN_DOWN(start);
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- return NULL;
+ pgdat = NODE_DATA(nid);
+ if (!pgdat) {
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat)
+ return NULL;
- arch_refresh_nodedata(nid, pgdat);
+ arch_refresh_nodedata(nid, pgdat);
+ }
/* we can use NODE_DATA(nid) from here */
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
+ free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+
+ /*
+ * The node we allocated has no zone fallback lists. To avoid
+ * accessing an uninitialized zonelist, build them here.
+ */
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(pgdat, NULL);
+ mutex_unlock(&zonelists_mutex);
return pgdat;
}
@@ -222,55 +1103,89 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
return;
}
-/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+
+/**
+ * try_online_node - online a node if offlined
+ *
+ * called by cpu_up() to online a node without onlined memory.
+ */
+int try_online_node(int nid)
{
- struct resource *res;
- res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- BUG_ON(!res);
+ pg_data_t *pgdat;
+ int ret;
- res->name = "System RAM";
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_MEM;
- if (request_resource(&iomem_resource, res) < 0) {
- printk("System RAM resource %llx - %llx cannot be added\n",
- (unsigned long long)res->start, (unsigned long long)res->end);
- kfree(res);
- res = NULL;
+ if (node_online(nid))
+ return 0;
+
+ mem_hotplug_begin();
+ pgdat = hotadd_new_pgdat(nid, 0);
+ if (!pgdat) {
+ pr_err("Cannot online node %d due to NULL pgdat\n", nid);
+ ret = -ENOMEM;
+ goto out;
}
- return res;
+ node_set_online(nid);
+ ret = register_one_node(nid);
+ BUG_ON(ret);
+
+ if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
+
+out:
+ mem_hotplug_done();
+ return ret;
}
-static void release_memory_resource(struct resource *res)
+static int check_hotplug_memory_range(u64 start, u64 size)
{
- if (!res)
- return;
- release_resource(res);
- kfree(res);
- return;
-}
+ u64 start_pfn = PFN_DOWN(start);
+ u64 nr_pages = size >> PAGE_SHIFT;
+ /* Memory range must be aligned with section */
+ if ((start_pfn & ~PAGE_SECTION_MASK) ||
+ (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
+ pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
+ (unsigned long long)start,
+ (unsigned long long)size);
+ return -EINVAL;
+ }
+ return 0;
+}
-int add_memory(int nid, u64 start, u64 size)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+int __ref add_memory(int nid, u64 start, u64 size)
{
pg_data_t *pgdat = NULL;
- int new_pgdat = 0;
+ bool new_pgdat;
+ bool new_node;
struct resource *res;
int ret;
+ ret = check_hotplug_memory_range(start, size);
+ if (ret)
+ return ret;
+
res = register_memory_resource(start, size);
+ ret = -EEXIST;
if (!res)
- return -EEXIST;
+ return ret;
+
+ { /* Stupid hack to suppress address-never-null warning */
+ void *p = NODE_DATA(nid);
+ new_pgdat = !p;
+ }
- if (!node_online(nid)) {
+ mem_hotplug_begin();
+
+ new_node = !node_online(nid);
+ if (new_node) {
pgdat = hotadd_new_pgdat(nid, start);
+ ret = -ENOMEM;
if (!pgdat)
- return -ENOMEM;
- new_pgdat = 1;
- ret = kswapd_run(nid);
- if (ret)
goto error;
}
@@ -283,7 +1198,7 @@ int add_memory(int nid, u64 start, u64 size)
/* we online node here. we can't roll back from here. */
node_set_online(nid);
- if (new_pgdat) {
+ if (new_node) {
ret = register_one_node(nid);
/*
* If sysfs file of new node can't create, cpu on the node
@@ -293,14 +1208,768 @@ int add_memory(int nid, u64 start, u64 size)
BUG_ON(ret);
}
- return ret;
+ /* create new memmap entry */
+ firmware_map_add_hotplug(start, start + size, "System RAM");
+
+ goto out;
+
error:
/* rollback pgdat allocation and others */
if (new_pgdat)
rollback_node_hotadd(nid, pgdat);
- if (res)
- release_memory_resource(res);
+ release_memory_resource(res);
+out:
+ mem_hotplug_done();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
+ * set, and page_order() gives the size of that free page.  The pageblock is
+ * treated as entirely free when the page at its start is such a buddy page
+ * of at least pageblock order: due to buddy constraints, a free page at
+ * least the size of a pageblock is located at the start of the pageblock.
+ */
+static inline int pageblock_free(struct page *page)
+{
+	if (!PageBuddy(page))
+		return 0;
+
+	return page_order(page) >= pageblock_order;
+}
+
+/*
+ * Return the start of the next active pageblock after @page, which must be
+ * pageblock-aligned.  A fully free pageblock lets us skip to the end of the
+ * free page it starts; otherwise we advance by exactly one pageblock.
+ */
+static struct page *next_active_pageblock(struct page *page)
+{
+	int order;
+
+	/* Ensure the starting page is pageblock-aligned */
+	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+
+	if (!pageblock_free(page))
+		return page + pageblock_nr_pages;
+
+	/*
+	 * No locks are held, so page_order() can have changed since
+	 * pageblock_free() looked at it; re-read and range-check it.
+	 */
+	order = page_order(page);
+	if ((order >= pageblock_order) && (order < MAX_ORDER))
+		return page + (1 << order);
+
+	return page + pageblock_nr_pages;
+}
+
+/*
+ * Check whether the @nr_pages pages starting at @start_pfn are likely to be
+ * hot-removable.  Only the first page of each active pageblock is examined.
+ *
+ * Returns 1 when every examined pageblock looks removable, 0 otherwise.
+ */
+int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct page *cur = pfn_to_page(start_pfn);
+	struct page *limit = cur + nr_pages;
+
+	while (cur < limit) {
+		if (!is_pageblock_removable_nolock(cur))
+			return 0;
+		cond_resched();
+		cur = next_active_pageblock(cur);
+	}
+
+	/* All pageblocks in the memory block are likely to be hot-removable */
+	return 1;
+}
+
+/*
+ * Confirm that all pages in the range [start_pfn, end_pfn) belong to the
+ * same zone.  One page is sampled per MAX_ORDER_NR_PAGES chunk: the first
+ * one that pfn_valid_within() accepts.  Chunks without such a page are
+ * skipped.
+ *
+ * Returns 1 when all sampled pages share one zone, 0 otherwise.
+ */
+static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct zone *seen = NULL;
+	struct page *page;
+	unsigned long pfn;
+	int i;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) {
+		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
+		for (i = 0; i < MAX_ORDER_NR_PAGES; i++) {
+			if (pfn_valid_within(pfn + i))
+				break;
+		}
+		if (i == MAX_ORDER_NR_PAGES)
+			continue;
+
+		page = pfn_to_page(pfn + i);
+		if (seen && page_zone(page) != seen)
+			return 0;
+		seen = page_zone(page);
+	}
+
+	return 1;
+}
+
+/*
+ * Scan the pfn range [start, end) for movable/migratable pages (LRU pages
+ * and active hugepages).  Scanning by pfn is much easier than walking
+ * linked lists.  Returns the pfn of the first movable page found, or 0 when
+ * there is none.
+ */
+static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+{
+	unsigned long pfn;
+	struct page *page;
+
+	for (pfn = start; pfn < end; pfn++) {
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (PageLRU(page))
+			return pfn;
+
+		if (!PageHuge(page))
+			continue;
+
+		if (is_hugepage_active(page))
+			return pfn;
+
+		/* Inactive hugepage: jump to its last pfn, the loop adds 1. */
+		pfn = round_up(pfn + 1, 1 << compound_order(page)) - 1;
+	}
+
+	return 0;
+}
+
+#define NR_OFFLINE_AT_ONCE_PAGES (256)
+/*
+ * Isolate movable pages found in [start_pfn, end_pfn) onto a private list
+ * and pass that list to migrate_pages().  The scan stops once the budget of
+ * NR_OFFLINE_AT_ONCE_PAGES pages is used up.
+ *
+ * ret starts at 0 and is overwritten by each isolate_lru_page() call, by
+ * -EBUSY (too-large huge page, or a page that is still referenced after LRU
+ * isolation failed) and finally by migrate_pages() when pages were isolated
+ * and no still-referenced page was seen; the last value written is returned.
+ */
+static int
+do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+ struct page *page;
+ int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
+ int not_managed = 0;
+ int ret = 0;
+ LIST_HEAD(source);
+
+ for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+
+ if (PageHuge(page)) {
+ struct page *head = compound_head(page);
+ pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ if (compound_order(head) > PFN_SECTION_SHIFT) {
+ ret = -EBUSY;
+ break;
+ }
+ if (isolate_huge_page(page, &source))
+ move_pages -= 1 << compound_order(head);
+ continue;
+ }
+
+ if (!get_page_unless_zero(page))
+ continue;
+ /*
+ * We can skip free pages (get_page_unless_zero() failed for
+ * them above). And we can only deal with pages on LRU.
+ */
+ ret = isolate_lru_page(page);
+ if (!ret) { /* Success */
+ put_page(page);
+ list_add_tail(&page->lru, &source);
+ move_pages--;
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+
+ } else {
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
+ pfn);
+ dump_page(page, "failed to remove from LRU");
+#endif
+ put_page(page);
+ /* Because we don't hold the big zone->lock, we should
+ check the page count again here. */
+ if (page_count(page)) {
+ not_managed++;
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+ if (!list_empty(&source)) {
+ if (not_managed) {
+ putback_movable_pages(&source);
+ goto out;
+ }
+
+ /*
+ * alloc_migrate_target should be improved.
+ * migrate_pages returns # of failed pages.
+ */
+ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
+ MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ if (ret)
+ putback_movable_pages(&source);
+ }
+out:
+ return ret;
+}
+
+/*
+ * walk_system_ram_range() callback: remove the chunk
+ * [start, start + nr_pages) from free_area[] and mark all of it Reserved.
+ * @data is unused.  Always returns 0.
+ */
+static int
+offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
+			  void *data)
+{
+	unsigned long end = start + nr_pages;
+
+	__offline_isolated_pages(start, end);
+
+	return 0;
+}
+
+/*
+ * Run offline_isolated_pages_cb() over every System RAM chunk inside
+ * [start_pfn, end_pfn).
+ */
+static void
+offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long nr_pages = end_pfn - start_pfn;
+
+	walk_system_ram_range(start_pfn, nr_pages, NULL,
+			      offline_isolated_pages_cb);
+}
+
+/*
+ * walk_system_ram_range() callback: check that all pages in the chunk
+ * [start_pfn, start_pfn + nr_pages), recorded as memory resource, are
+ * isolated.  When test_pages_isolated() returns 0, @nr_pages is added to
+ * the long counter that @data points to.
+ *
+ * Returns the result of test_pages_isolated().
+ */
+static int
+check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
+			void *data)
+{
+	long *offlined = data;
+	int ret;
+
+	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
+	if (!ret)
+		*offlined += (long)nr_pages;
+
+	return ret;
+}
+
+/*
+ * Run check_pages_isolated_cb() over every System RAM chunk inside
+ * [start_pfn, end_pfn).
+ *
+ * Returns the number of pages counted by the callback, or the negative
+ * value returned by walk_system_ram_range() when the walk fails.
+ */
+static long
+check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
+{
+	long isolated = 0;
+	int err;
+
+	err = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &isolated,
+				    check_pages_isolated_cb);
+
+	return (err < 0) ? (long)err : isolated;
+}
+
+#ifdef CONFIG_MOVABLE_NODE
+/*
+ * With CONFIG_MOVABLE_NODE a node is allowed to have no normal memory, so
+ * offlining normal memory is always permitted.
+ */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+	return true;
+}
+#else /* CONFIG_MOVABLE_NODE */
+/*
+ * Ensure the node keeps NORMAL memory for as long as it is online:
+ * offlining @nr_pages is fine when the zones 0...ZONE_NORMAL hold more
+ * present pages than that, or when the zones above ZONE_NORMAL hold none.
+ */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long low_pages = 0;
+	unsigned long high_pages = 0;
+	enum zone_type zt;
+
+	for (zt = 0; zt <= ZONE_NORMAL; zt++)
+		low_pages += pgdat->node_zones[zt].present_pages;
+
+	if (low_pages > nr_pages)
+		return true;
+
+	for (zt = ZONE_NORMAL + 1; zt <= ZONE_MOVABLE; zt++)
+		high_pages += pgdat->node_zones[zt].present_pages;
+
+	/*
+	 * we can't offline the last normal memory until all
+	 * higher memory is offlined.
+	 */
+	return high_pages == 0;
+}
+#endif /* CONFIG_MOVABLE_NODE */
+
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+ /*
+ * Handler for the "movable_node" early parameter: switch memblock to
+ * bottom-up allocation and set movable_node_enabled. The argument
+ * string @p is not looked at.
+ *
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at a very early time. We
+ * cannot prevent this anyway. So on a NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes of memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try our best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ memblock_set_bottom_up(true);
+ movable_node_enabled = true;
+#else
+ pr_warn("movable_node option not supported\n");
+#endif
+ return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
+/*
+ * Check which node_states masks will change when @nr_pages of @zone are
+ * offlined and record the answer in @arg.  Each arg->status_change_nid*
+ * field is set to the zone's node id when the node will have to be cleared
+ * from the matching mask, or to -1 when that mask stays as it is.  Nothing
+ * in node_states is modified here.
+ *
+ * present_pages and zt are deliberately carried over from one loop to the
+ * next, so each later check sums the zones of a wider range.
+ */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt, zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is in a zone of 0...zone_last,
+ * and it is the last present memory, 0...zone_last will
+ * become empty after offline, thus we can determine we will
+ * need to clear the node from node_states[N_NORMAL_MEMORY].
+ */
+ for (zt = 0; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_normal = zone_to_nid(zone);
+ else
+ arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_high = zone_to_nid(zone);
+ else
+ arg->status_change_nid_high = -1;
+#else
+ /* Without CONFIG_HIGHMEM the high result simply mirrors the normal one. */
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * node_states[N_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ */
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_MEMORY] will be changed.
+ * If we try to offline the last present @nr_pages from the node,
+ * we can determine we will need to clear the node from
+ * node_states[N_MEMORY].
+ */
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (nr_pages >= present_pages)
+ arg->status_change_nid = zone_to_nid(zone);
+ else
+ arg->status_change_nid = -1;
+}
+
+/*
+ * Apply the node_states changes recorded in @arg to @node after memory has
+ * been offlined.  A mask is cleared only when its status_change_nid* field
+ * is non-negative; N_HIGH_MEMORY and N_MEMORY are additionally skipped when
+ * they are the same state as the one handled just before them.
+ */
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+	if (arg->status_change_nid_normal >= 0)
+		node_clear_state(node, N_NORMAL_MEMORY);
+
+	if (N_MEMORY != N_NORMAL_MEMORY) {
+		if (arg->status_change_nid_high >= 0)
+			node_clear_state(node, N_HIGH_MEMORY);
+	}
+
+	if (N_MEMORY != N_HIGH_MEMORY) {
+		if (arg->status_change_nid >= 0)
+			node_clear_state(node, N_MEMORY);
+	}
+}
+
+static int __ref __offline_pages(unsigned long start_pfn,
+ unsigned long end_pfn, unsigned long timeout)
+{
+ unsigned long pfn, nr_pages, expire;
+ long offlined_pages;
+ int ret, drain, retry_max, node;
+ unsigned long flags;
+ struct zone *zone;
+ struct memory_notify arg;
+
+ /* at least, alignment against pageblock is necessary */
+ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ /* This makes hotplug much easier...and readable.
+ We assume this for now. */
+ if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ return -EINVAL;
+
+ mem_hotplug_begin();
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ node = zone_to_nid(zone);
+ nr_pages = end_pfn - start_pfn;
+
+ ret = -EINVAL;
+ if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+ goto out;
+
+ /* set above range as isolated */
+ ret = start_isolate_page_range(start_pfn, end_pfn,
+ MIGRATE_MOVABLE, true);
+ if (ret)
+ goto out;
+
+ arg.start_pfn = start_pfn;
+ arg.nr_pages = nr_pages;
+ node_states_check_changes_offline(nr_pages, zone, &arg);
+
+ ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_removal;
+
+ pfn = start_pfn;
+ expire = jiffies + timeout;
+ drain = 0;
+ retry_max = 5;
+repeat:
+ /* start memory hot removal */
+ ret = -EAGAIN;
+ if (time_after(jiffies, expire))
+ goto failed_removal;
+ ret = -EINTR;
+ if (signal_pending(current))
+ goto failed_removal;
+ ret = 0;
+ if (drain) {
+ lru_add_drain_all();
+ cond_resched();
+ drain_all_pages();
+ }
+
+ pfn = scan_movable_pages(start_pfn, end_pfn);
+ if (pfn) { /* We have movable pages */
+ ret = do_migrate_range(pfn, end_pfn);
+ if (!ret) {
+ drain = 1;
+ goto repeat;
+ } else {
+ if (ret < 0)
+ if (--retry_max == 0)
+ goto failed_removal;
+ yield();
+ drain = 1;
+ goto repeat;
+ }
+ }
+ /* drain all zone's lru pagevec, this is asynchronous... */
+ lru_add_drain_all();
+ yield();
+ /* drain pcp pages, this is synchronous. */
+ drain_all_pages();
+ /*
+ * dissolve free hugepages in the memory block before doing offlining
+ * actually in order to make hugetlbfs's object counting consistent.
+ */
+ dissolve_free_huge_pages(start_pfn, end_pfn);
+ /* check again */
+ offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+ if (offlined_pages < 0) {
+ ret = -EBUSY;
+ goto failed_removal;
+ }
+ printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ /* Ok, all of our target is isolated.
+ We cannot do rollback at this point. */
+ offline_isolated_pages(start_pfn, end_pfn);
+ /* reset pagetype flags and makes migrate type to be MOVABLE */
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ /* removal success */
+ adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
+ zone->present_pages -= offlined_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages -= offlined_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ init_per_zone_wmark_min();
+
+ if (!populated_zone(zone)) {
+ zone_pcp_reset(zone);
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ } else
+ zone_pcp_update(zone);
+
+ node_states_clear_node(node, &arg);
+ if (arg.status_change_nid >= 0)
+ kswapd_stop(node);
+
+ vm_total_pages = nr_free_pagecache_pages();
+ writeback_set_ratelimit();
+
+ memory_notify(MEM_OFFLINE, &arg);
+ mem_hotplug_done();
+ return 0;
+
+failed_removal:
+ printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ /* pushback to free area */
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+
+out:
+ mem_hotplug_done();
+ return ret;
+}
+
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/**
+ * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
+ * @start_pfn: start pfn of the memory range
+ * @end_pfn: end pfn of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present mem sections in range
+ * [start_pfn, end_pfn) and call func on each mem section.
+ *
+ * Returns the return value of func.
+ */
+int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg, int (*func)(struct memory_block *, void *))
+{
+ struct memory_block *mem = NULL;
+ struct mem_section *section;
+ unsigned long pfn, section_nr;
+ int ret;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ section_nr = pfn_to_section_nr(pfn);
+ if (!present_section_nr(section_nr))
+ continue;
+
+ section = __nr_to_section(section_nr);
+ /* same memblock? */
+ if (mem)
+ if ((section_nr >= mem->start_section_nr) &&
+ (section_nr <= mem->end_section_nr))
+ continue;
+
+ mem = find_memory_block_hinted(section, mem);
+ if (!mem)
+ continue;
+
+ ret = func(mem, arg);
+ if (ret) {
+ kobject_put(&mem->dev.kobj);
+ return ret;
+ }
+ }
+
+ if (mem)
+ kobject_put(&mem->dev.kobj);
+
+ return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
+{
+ int ret = !is_memblock_offlined(mem);
+
+ if (unlikely(ret)) {
+ phys_addr_t beginpa, endpa;
+
+ beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+ endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ pr_warn("removing memory fails, because memory "
+ "[%pa-%pa] is onlined\n",
+ &beginpa, &endpa);
+ }
+
+ return ret;
+}
+
+static int check_cpu_on_node(pg_data_t *pgdat)
+{
+ int cpu;
+
+ for_each_present_cpu(cpu) {
+ if (cpu_to_node(cpu) == pgdat->node_id)
+ /*
+ * the cpu on this node isn't removed, and we can't
+ * offline this node.
+ */
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void unmap_cpu_on_node(pg_data_t *pgdat)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ if (cpu_to_node(cpu) == pgdat->node_id)
+ numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
+{
+ int ret;
+
+ ret = check_cpu_on_node(pgdat);
+ if (ret)
+ return ret;
+
+ /*
+ * the node will be offlined when we come here, so we can clear
+ * the cpu_to_node() now.
+ */
+
+ unmap_cpu_on_node(pgdat);
+ return 0;
+}
+
+/**
+ * try_offline_node
+ *
+ * Offline a node if all memory sections and cpus of the node are removed.
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call.
+ */
+void try_offline_node(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = pgdat->node_start_pfn;
+ unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+ unsigned long pfn;
+ struct page *pgdat_page = virt_to_page(pgdat);
+ int i;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+ if (!present_section_nr(section_nr))
+ continue;
+
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+
+ /*
+ * some memory sections of this node are not removed, and we
+ * can't offline node now.
+ */
+ return;
+ }
+
+ if (check_and_unmap_cpu_on_node(pgdat))
+ return;
+
+ /*
+ * all memory/cpu of this node are removed, we can offline this
+ * node now.
+ */
+ node_set_offline(nid);
+ unregister_one_node(nid);
+
+ if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+ /* node data is allocated from boot memory */
+ return;
+
+ /* free the wait_table of each zone */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ /*
+ * wait_table may be allocated from boot memory,
+ * here only free if it's allocated by vmalloc.
+ */
+ if (is_vmalloc_addr(zone->wait_table))
+ vfree(zone->wait_table);
+ }
+
+ /*
+ * Since there is no way to guarantee the address of pgdat/zone is not
+ * on stack of any kernel threads or used by other kernel objects
+ * without reference counting or other synchronizing method, do not
+ * reset node_data and free pgdat here. Just reset it to 0 and reuse
+ * the memory when the node is online again.
+ */
+ memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+/**
+ * remove_memory
+ *
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations before this call, as required by
+ * try_offline_node().
+ */
+void __ref remove_memory(int nid, u64 start, u64 size)
+{
+ int ret;
+
+ BUG_ON(check_hotplug_memory_range(start, size));
+
+ mem_hotplug_begin();
+
+ /*
+ * All memory blocks must be offlined before removing memory. Check
+ * whether all memory blocks in question are offline and trigger a BUG()
+ * if this is not the case.
+ */
+ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
+ check_memblock_offlined_cb);
+ if (ret)
+ BUG();
+
+ /* remove memmap entry */
+ firmware_map_remove(start, start + size, "System RAM");
+
+ arch_remove_memory(start, size);
+
+ try_offline_node(nid);
+
+ mem_hotplug_done();
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cf18f094255..8f5330d74f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
- * As a special case node -1 here means do the allocation
+ * As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
@@ -63,151 +63,427 @@
grows down?
make bind policy root only? It can trigger oom much faster and the
kernel is not always grateful with that.
- could replace all the switch()es with a mempolicy_ops structure.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
-#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
-#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/ctype.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+#include <linux/random.h>
+
+#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
-#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
-#define PDprintk(fmt...)
-
/* Highest zone. An specific allocation for a zone below that is not
policied. */
-enum zone_type policy_zone = ZONE_DMA;
+enum zone_type policy_zone = 0;
-struct mempolicy default_policy = {
+/*
+ * run-time system-wide default policy => local allocation
+ */
+static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .policy = MPOL_DEFAULT,
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL,
};
-/* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, nodemask_t *nodes)
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
{
- int empty = nodes_empty(*nodes);
+ struct mempolicy *pol = p->mempolicy;
- switch (mode) {
- case MPOL_DEFAULT:
- if (!empty)
- return -EINVAL;
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- /* Preferred will only use the first bit, but allow
- more for now. */
- if (empty)
- return -EINVAL;
- break;
+ if (!pol) {
+ int node = numa_node_id();
+
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
+ /*
+ * preferred_node_policy is not initialised early in
+ * boot
+ */
+ if (!pol->mode)
+ pol = NULL;
+ }
}
- return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
+
+ return pol;
}
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+static const struct mempolicy_operations {
+ int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
+ /*
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task rebinds task->mempolicy in two steps. The first step
+ * sets all the new nodes, and the second step clears all the
+ * disallowed nodes. This way, we can avoid finding no node to allocate
+ * a page from.
+ * If the read side holds a lock protecting task->mempolicy, we
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the new nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step);
+} mpol_ops[MPOL_MAX];
+
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(const nodemask_t *nodemask)
{
- struct zonelist *zl;
- int num, max, nd;
- enum zone_type k;
+ return nodes_intersects(*nodemask, node_states[N_MEMORY]);
+}
- max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
- if (!zl)
- return NULL;
- num = 0;
- /* First put in the highest zones from all nodes, then all the next
- lower zones etc. Avoid empty zones because the memory allocator
- doesn't like them. If you implement node hot removal you
- have to fix that. */
- k = policy_zone;
- while (1) {
- for_each_node_mask(nd, *nodes) {
- struct zone *z = &NODE_DATA(nd)->node_zones[k];
- if (z->present_pages > 0)
- zl->zones[num++] = z;
- }
- if (k == 0)
- break;
- k--;
+static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
+{
+ return pol->flags & MPOL_MODE_FLAGS;
+}
+
+static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
+ const nodemask_t *rel)
+{
+ nodemask_t tmp;
+ nodes_fold(tmp, *orig, nodes_weight(*rel));
+ nodes_onto(*ret, tmp, *rel);
+}
+
+static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
+}
+
+static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (!nodes)
+ pol->flags |= MPOL_F_LOCAL; /* local allocation */
+ else if (nodes_empty(*nodes))
+ return -EINVAL; /* no allowed nodes */
+ else
+ pol->v.preferred_node = first_node(*nodes);
+ return 0;
+}
+
+static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
+{
+ if (!is_valid_nodemask(nodes))
+ return -EINVAL;
+ pol->v.nodes = *nodes;
+ return 0;
+}
+
+/*
+ * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
+ * any, for the new policy. mpol_new() has already validated the nodes
+ * parameter with respect to the policy mode and flags. But, we need to
+ * handle an empty nodemask with MPOL_PREFERRED here.
+ *
+ * Must be called holding task's alloc_lock to protect task's mems_allowed
+ * and mempolicy. May also be called holding the mmap_semaphore for write.
+ */
+static int mpol_set_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes, struct nodemask_scratch *nsc)
+{
+ int ret;
+
+ /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
+ if (pol == NULL)
+ return 0;
+ /* Check N_MEMORY */
+ nodes_and(nsc->mask1,
+ cpuset_current_mems_allowed, node_states[N_MEMORY]);
+
+ VM_BUG_ON(!nodes);
+ if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
+ nodes = NULL; /* explicit local allocation */
+ else {
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
+ else
+ pol->w.cpuset_mems_allowed =
+ cpuset_current_mems_allowed;
}
- zl->zones[num] = NULL;
- return zl;
+
+ if (nodes)
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ else
+ ret = mpol_ops[pol->mode].create(pol, NULL);
+ return ret;
}
-/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+/*
+ * This function just creates a new policy, does some check and simple
+ * initialization. You must invoke mpol_set_nodemask() to set nodes.
+ */
+static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
- if (mode == MPOL_DEFAULT)
+ pr_debug("setting mode %d flags %d nodes[0] %lx\n",
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
+
+ if (mode == MPOL_DEFAULT) {
+ if (nodes && !nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
return NULL;
+ }
+ VM_BUG_ON(!nodes);
+
+ /*
+ * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
+ * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
+ * All other modes require a valid pointer to a non-empty nodemask.
+ */
+ if (mode == MPOL_PREFERRED) {
+ if (nodes_empty(*nodes)) {
+ if (((flags & MPOL_F_STATIC_NODES) ||
+ (flags & MPOL_F_RELATIVE_NODES)))
+ return ERR_PTR(-EINVAL);
+ }
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
+ } else if (nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
- switch (mode) {
- case MPOL_INTERLEAVE:
- policy->v.nodes = *nodes;
- if (nodes_weight(*nodes) == 0) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-EINVAL);
- }
- break;
- case MPOL_PREFERRED:
- policy->v.preferred_node = first_node(*nodes);
- if (policy->v.preferred_node >= MAX_NUMNODES)
- policy->v.preferred_node = -1;
- break;
- case MPOL_BIND:
- policy->v.zonelist = bind_zonelist(nodes);
- if (policy->v.zonelist == NULL) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-ENOMEM);
- }
- break;
- }
- policy->policy = mode;
- policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+ policy->mode = mode;
+ policy->flags = flags;
+
return policy;
}
-static void gather_stats(struct page *, void *, int pte_dirty);
+/* Slow path of a mpol destructor. */
+void __mpol_put(struct mempolicy *p)
+{
+ if (!atomic_dec_and_test(&p->refcnt))
+ return;
+ kmem_cache_free(policy_cache, p);
+}
+
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
+{
+}
+
+/*
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the new nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES)
+ nodes_and(tmp, pol->w.user_nodemask, *nodes);
+ else if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ else {
+ /*
+ * if step == 1, we use ->w.cpuset_mems_allowed to cache the
+ * result
+ */
+ if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+ nodes_remap(tmp, pol->v.nodes,
+ pol->w.cpuset_mems_allowed, *nodes);
+ pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+ } else if (step == MPOL_REBIND_STEP2) {
+ tmp = pol->w.cpuset_mems_allowed;
+ pol->w.cpuset_mems_allowed = *nodes;
+ } else
+ BUG();
+ }
+
+ if (nodes_empty(tmp))
+ tmp = *nodes;
+
+ if (step == MPOL_REBIND_STEP1)
+ nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+ else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+ pol->v.nodes = tmp;
+ else
+ BUG();
+
+ if (!node_isset(current->il_next, tmp)) {
+ current->il_next = next_node(current->il_next, tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = first_node(tmp);
+ if (current->il_next >= MAX_NUMNODES)
+ current->il_next = numa_node_id();
+ }
+}
+
+static void mpol_rebind_preferred(struct mempolicy *pol,
+ const nodemask_t *nodes,
+ enum mpol_rebind_step step)
+{
+ nodemask_t tmp;
+
+ if (pol->flags & MPOL_F_STATIC_NODES) {
+ int node = first_node(pol->w.user_nodemask);
+
+ if (node_isset(node, *nodes)) {
+ pol->v.preferred_node = node;
+ pol->flags &= ~MPOL_F_LOCAL;
+ } else
+ pol->flags |= MPOL_F_LOCAL;
+ } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
+ mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+ pol->v.preferred_node = first_node(tmp);
+ } else if (!(pol->flags & MPOL_F_LOCAL)) {
+ pol->v.preferred_node = node_remap(pol->v.preferred_node,
+ pol->w.cpuset_mems_allowed,
+ *nodes);
+ pol->w.cpuset_mems_allowed = *nodes;
+ }
+}
+
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task rebinds task->mempolicy in two steps. The first step
+ * sets all the new nodes, and the second step clears all the
+ * disallowed nodes. This way, we can avoid finding no node to allocate
+ * a page from.
+ * If the read side holds a lock protecting task->mempolicy, we
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do rebind work at once
+ * MPOL_REBIND_STEP1 - set all the new nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+ enum mpol_rebind_step step)
+{
+ if (!pol)
+ return;
+ if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+ nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
+ return;
+
+ if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+ return;
+
+ if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+ BUG();
+
+ if (step == MPOL_REBIND_STEP1)
+ pol->flags |= MPOL_F_REBINDING;
+ else if (step == MPOL_REBIND_STEP2)
+ pol->flags &= ~MPOL_F_REBINDING;
+ else if (step >= MPOL_REBIND_NSTEP)
+ BUG();
+
+ mpol_ops[pol->mode].rebind(pol, newmask, step);
+}
+
+/*
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ *
+ * Called with task's alloc_lock held.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+ enum mpol_rebind_step step)
+{
+ mpol_rebind_policy(tsk->mempolicy, new, step);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
+ *
+ * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ */
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
+{
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
+ up_write(&mm->mmap_sem);
+}
+
+static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+ [MPOL_DEFAULT] = {
+ .rebind = mpol_rebind_default,
+ },
+ [MPOL_INTERLEAVE] = {
+ .create = mpol_new_interleave,
+ .rebind = mpol_rebind_nodemask,
+ },
+ [MPOL_PREFERRED] = {
+ .create = mpol_new_preferred,
+ .rebind = mpol_rebind_preferred,
+ },
+ [MPOL_BIND] = {
+ .create = mpol_new_bind,
+ .rebind = mpol_rebind_nodemask,
+ },
+};
+
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
-/* Scan through pages checking if pages follow certain conditions. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+/*
+ * Scan through pages checking if pages follow certain conditions,
+ * and move them to the pagelist if they do.
+ */
+static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -219,7 +495,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
struct page *page;
- unsigned int nid;
+ int nid;
if (!pte_present(*pte))
continue;
@@ -227,15 +503,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!page)
continue;
/*
- * The check for PageReserved here is important to avoid
- * handling zero pages and other pages that may have been
- * marked special by the system.
- *
- * If the PageReserved would not be checked here then f.e.
- * the location of the zero page could have an influence
- * on MPOL_MF_STRICT, zero pages would be counted for
- * the per node stats, and there would be useless attempts
- * to put zero pages on the migration list.
+ * vm_normal_page() filters out zero pages, but there might
+ * still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
continue;
@@ -243,9 +512,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
continue;
- if (flags & MPOL_MF_STATS)
- gather_stats(page, private, pte_dirty(*pte));
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
break;
@@ -254,7 +521,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return addr != end;
}
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
+ pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
+ void *private)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ int nid;
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+ entry = huge_ptep_get((pte_t *)pmd);
+ if (!pte_present(entry))
+ goto unlock;
+ page = pte_page(entry);
+ nid = page_to_nid(page);
+ if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ goto unlock;
+ /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+ if (flags & (MPOL_MF_MOVE_ALL) ||
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
+ isolate_huge_page(page, private);
+unlock:
+ spin_unlock(ptl);
+#else
+ BUG();
+#endif
+}
+
+static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -265,16 +561,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (!pmd_present(*pmd))
+ continue;
+ if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
+ queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
+ flags, private);
+ continue;
+ }
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- if (check_pte_range(vma, pmd, addr, next, nodes,
+ if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
flags, private))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -285,16 +589,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(vma, pud, addr, next, nodes,
+ if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
flags, private))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
-static inline int check_pgd_range(struct vm_area_struct *vma,
+static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
const nodemask_t *nodes, unsigned long flags,
void *private)
@@ -307,200 +613,263 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(vma, pgd, addr, next, nodes,
+ if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
flags, private))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
}
-/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
- if (vma->vm_flags & (
- VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
- return 0;
- return 1;
+ int nr_updated;
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
/*
- * Check if all pages in a range are on a set of nodes.
- * If pagelist != NULL then isolate pages from the LRU and
- * put them on the pagelist.
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are on a set of nodes (determined by
+ * @nodes and @flags), they are isolated and queued to the pagelist which
+ * is passed via @private.
*/
-static struct vm_area_struct *
-check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+static int
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
- int err;
- struct vm_area_struct *first, *vma, *prev;
+ int err = 0;
+ struct vm_area_struct *vma, *prev;
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ return -EFAULT;
+ prev = NULL;
+ for (; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
- err = migrate_prep();
- if (err)
- return ERR_PTR(err);
- }
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
- prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
+ }
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
- err = check_pgd_range(vma, start, endvma, nodes,
+ vma_migratable(vma))) {
+
+ err = queue_pages_pgd_range(vma, start, endvma, nodes,
flags, private);
- if (err) {
- first = ERR_PTR(err);
+ if (err)
break;
- }
}
+next:
prev = vma;
}
- return first;
+ return err;
}
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
{
- int err = 0;
- struct mempolicy *old = vma->vm_policy;
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
- PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy)
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
- if (!err) {
- mpol_get(new);
- vma->vm_policy = new;
- mpol_free(old);
+ if (err)
+ goto err_out;
}
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
return err;
}
/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *next;
- int err;
-
- err = 0;
- for (; vma && vma->vm_start < end; vma = next) {
- next = vma->vm_next;
- if (vma->vm_start < start)
- err = split_vma(vma->vm_mm, vma, start, 1);
- if (!err && vma->vm_end > end)
- err = split_vma(vma->vm_mm, vma, end, 0);
- if (!err)
- err = policy_vma(vma, new);
- if (err)
- break;
- }
- return err;
-}
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ int err = 0;
+ pgoff_t pgoff;
+ unsigned long vmstart;
+ unsigned long vmend;
-static int contextualize_policy(int mode, nodemask_t *nodes)
-{
- if (!nodes)
- return 0;
+ vma = find_vma(mm, start);
+ if (!vma || vma->vm_start > start)
+ return -EFAULT;
- cpuset_update_task_memory_state();
- if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
- return -EINVAL;
- return mpol_check_policy(mode, nodes);
-}
+ prev = vma->vm_prev;
+ if (start > vma->vm_start)
+ prev = vma;
+ for (; vma && vma->vm_start < end; prev = vma, vma = next) {
+ next = vma->vm_next;
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
+ pgoff = vma->vm_pgoff +
+ ((vmstart - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol);
+ if (prev) {
+ vma = prev;
+ next = vma->vm_next;
+ if (mpol_equal(vma_policy(vma), new_pol))
+ continue;
+ /* vma_merge() joined vma && vma->next, case 8 */
+ goto replace;
+ }
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ if (err)
+ goto out;
+ }
+ if (vma->vm_end != vmend) {
+ err = split_vma(vma->vm_mm, vma, vmend, 0);
+ if (err)
+ goto out;
+ }
+ replace:
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
+ }
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
+ out:
+ return err;
}
/* Set the process memory policy */
-long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(unsigned short mode, unsigned short flags,
+ nodemask_t *nodes)
{
- struct mempolicy *new;
+ struct mempolicy *new, *old;
+ struct mm_struct *mm = current->mm;
+ NODEMASK_SCRATCH(scratch);
+ int ret;
- if (contextualize_policy(mode, nodes))
- return -EINVAL;
- new = mpol_new(mode, nodes);
- if (IS_ERR(new))
- return PTR_ERR(new);
- mpol_free(current->mempolicy);
+ if (!scratch)
+ return -ENOMEM;
+
+ new = mpol_new(mode, flags, nodes);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto out;
+ }
+ /*
+ * prevent changing our mempolicy while show_numa_maps()
+ * is using it.
+ * Note: do_set_mempolicy() can be called at init time
+ * with no 'mm'.
+ */
+ if (mm)
+ down_write(&mm->mmap_sem);
+ task_lock(current);
+ ret = mpol_set_nodemask(new, nodes, scratch);
+ if (ret) {
+ task_unlock(current);
+ if (mm)
+ up_write(&mm->mmap_sem);
+ mpol_put(new);
+ goto out;
+ }
+ old = current->mempolicy;
current->mempolicy = new;
- mpol_set_task_struct_flag();
- if (new && new->policy == MPOL_INTERLEAVE)
+ if (new && new->mode == MPOL_INTERLEAVE &&
+ nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
- return 0;
+ task_unlock(current);
+ if (mm)
+ up_write(&mm->mmap_sem);
+
+ mpol_put(old);
+ ret = 0;
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+ return ret;
}
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ *
+ * Called with task's alloc_lock held
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
- int i;
-
nodes_clear(*nodes);
- switch (p->policy) {
+ if (p == &default_policy)
+ return;
+
+ switch (p->mode) {
case MPOL_BIND:
- for (i = 0; p->v.zonelist->zones[i]; i++)
- node_set(zone_to_nid(p->v.zonelist->zones[i]),
- *nodes);
- break;
- case MPOL_DEFAULT:
- break;
+ /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
- /* or use current node instead of online map? */
- if (p->v.preferred_node < 0)
- *nodes = node_online_map;
- else
+ if (!(p->flags & MPOL_F_LOCAL))
node_set(p->v.preferred_node, *nodes);
+ /* else return empty node mask for local allocation */
break;
default:
BUG();
@@ -521,18 +890,34 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
}
/* Retrieve NUMA policy */
-long do_get_mempolicy(int *policy, nodemask_t *nmask,
- unsigned long addr, unsigned long flags)
+static long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_task_memory_state();
- if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
+ if (flags &
+ ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
+
+ if (flags & MPOL_F_MEMS_ALLOWED) {
+ if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
+ return -EINVAL;
+ *policy = 0; /* just so it's initialized */
+ task_lock(current);
+ *nmask = cpuset_current_mems_allowed;
+ task_unlock(current);
+ return 0;
+ }
+
if (flags & MPOL_F_ADDR) {
+ /*
+ * Do NOT fall back to task policy if the
+ * vma/shared policy at addr is NULL. We
+ * want to return MPOL_DEFAULT in this case.
+ */
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
@@ -547,7 +932,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
return -EINVAL;
if (!pol)
- pol = &default_policy;
+ pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
@@ -556,14 +941,21 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
- pol->policy == MPOL_INTERLEAVE) {
+ pol->mode == MPOL_INTERLEAVE) {
*policy = current->il_next;
} else {
err = -EINVAL;
goto out;
}
- } else
- *policy = pol->policy;
+ } else {
+ *policy = pol == &default_policy ? MPOL_DEFAULT :
+ pol->mode;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
+ }
if (vma) {
up_read(&current->mm->mmap_sem);
@@ -571,10 +963,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
err = 0;
- if (nmask)
- get_zonemask(pol, nmask);
+ if (nmask) {
+ if (mpol_store_user_nodemask(pol)) {
+ *nmask = pol->w.user_nodemask;
+ } else {
+ task_lock(current);
+ get_policy_nodemask(pol, nmask);
+ task_unlock(current);
+ }
+ }
out:
+ mpol_cond_put(pol);
if (vma)
up_read(&current->mm->mmap_sem);
return err;
@@ -590,20 +990,30 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
- isolate_lru_page(page, pagelist);
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (!isolate_lru_page(page)) {
+ list_add_tail(&page->lru, pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ }
+ }
}
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_node(node, GFP_HIGHUSER, 0);
+ if (PageHuge(page))
+ return alloc_huge_page_node(page_hstate(compound_head(page)),
+ node);
+ else
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
-int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+ int flags)
{
nodemask_t nmask;
LIST_HEAD(pagelist);
@@ -612,11 +1022,21 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ /*
+ * This does not "check" the range but isolates all pages that
+ * need migration. Between passing in the full user address
+ * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ */
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+ queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest);
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+ MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(&pagelist);
+ }
return err;
}
@@ -627,59 +1047,82 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
*
* Returns the number of page that could not be moved.
*/
-int do_migrate_pages(struct mm_struct *mm,
- const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
{
- LIST_HEAD(pagelist);
int busy = 0;
- int err = 0;
+ int err;
nodemask_t tmp;
- down_read(&mm->mmap_sem);
+ err = migrate_prep();
+ if (err)
+ return err;
+
+ down_read(&mm->mmap_sem);
- err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ err = migrate_vmas(mm, from, to, flags);
if (err)
goto out;
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+ /*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory from that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
- tmp = *from_nodes;
+ tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
- int source = -1;
+ int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
- d = node_remap(s, *from_nodes, *to_nodes);
+
+ /*
+ * do_migrate_pages() tries to maintain the relative
+ * node relationship of the pages established between
+ * threads and memory areas.
+ *
+ * However if the number of source nodes is not equal to
+ * the number of destination nodes we can not preserve
+ * this node relative relationship. In that case, skip
+ * copying memory from a node that is in the destination
+ * mask.
+ *
+ * Example: [2,3,4] -> [3,4,5] moves everything.
+ * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
+ */
+
+ if ((nodes_weight(*from) != nodes_weight(*to)) &&
+ (node_isset(s, *to)))
+ continue;
+
+ d = node_remap(s, *from, *to);
if (s == d)
continue;
@@ -690,7 +1133,7 @@ int do_migrate_pages(struct mm_struct *mm,
if (!node_isset(dest, tmp))
break;
}
- if (source == -1)
+ if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
@@ -708,11 +1151,34 @@ out:
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * Start by assuming the page is mapped by the same vma as contains @start.
+ * Search forward from there, if not. N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.
+ */
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ struct vm_area_struct *vma;
+ unsigned long uninitialized_var(address);
- return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+ vma = find_vma(current->mm, start);
+ while (vma) {
+ address = page_address_in_vma(page, vma);
+ if (address != -EFAULT)
+ break;
+ vma = vma->vm_next;
+ }
+
+ if (PageHuge(page)) {
+ BUG_ON(!vma);
+ return alloc_huge_page_noerr(vma, address, 1);
+ }
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else
@@ -721,31 +1187,29 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
{
}
-int do_migrate_pages(struct mm_struct *mm,
- const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to, int flags)
{
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
return NULL;
}
#endif
-long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask, unsigned long flags)
+static long do_mbind(unsigned long start, unsigned long len,
+ unsigned short mode, unsigned short mode_flags,
+ nodemask_t *nmask, unsigned long flags)
{
- struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
int err;
LIST_HEAD(pagelist);
- if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- || mode > MPOL_MAX)
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -764,13 +1228,13 @@ long do_mbind(unsigned long start, unsigned long len,
if (end == start)
return 0;
- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
-
- new = mpol_new(mode, nmask);
+ new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -778,29 +1242,56 @@ long do_mbind(unsigned long start, unsigned long len,
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
+ pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+ start, start + len, mode, mode_flags,
+ nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
- down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nmask,
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ goto mpol_out;
+ }
+ {
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ down_write(&mm->mmap_sem);
+ task_lock(current);
+ err = mpol_set_nodemask(new, nmask, scratch);
+ task_unlock(current);
+ if (err)
+ up_write(&mm->mmap_sem);
+ } else
+ err = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ }
+ if (err)
+ goto mpol_out;
+
+ err = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
+ if (!err)
+ err = mbind_range(mm, start, end, new);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
+ if (!err) {
int nr_failed = 0;
- err = mbind_range(vma, start, end, new);
-
- if (!list_empty(&pagelist))
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma);
+ if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
+ nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ if (nr_failed)
+ putback_movable_pages(&pagelist);
+ }
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
- }
+ } else
+ putback_movable_pages(&pagelist);
up_write(&mm->mmap_sem);
- mpol_free(new);
+ mpol_out:
+ mpol_put(new);
return err;
}
@@ -871,66 +1362,85 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned, flags)
{
nodemask_t nodes;
int err;
+ unsigned short mode_flags;
+ mode_flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if (mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((mode_flags & MPOL_F_STATIC_NODES) &&
+ (mode_flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, &nodes, flags);
+ return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
+ unsigned long, maxnode)
{
int err;
nodemask_t nodes;
+ unsigned short flags;
- if (mode < 0 || mode > MPOL_MAX)
+ flags = mode & MPOL_MODE_FLAGS;
+ mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)mode >= MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, &nodes);
+ return do_set_mempolicy(mode, flags, &nodes);
}
-asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
- const unsigned long __user *old_nodes,
- const unsigned long __user *new_nodes)
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+ const unsigned long __user *, old_nodes,
+ const unsigned long __user *, new_nodes)
{
- struct mm_struct *mm;
+ const struct cred *cred = current_cred(), *tcred;
+ struct mm_struct *mm = NULL;
struct task_struct *task;
- nodemask_t old;
- nodemask_t new;
nodemask_t task_nodes;
int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
- err = get_nodes(&old, old_nodes, maxnode);
+ if (!scratch)
+ return -ENOMEM;
+
+ old = &scratch->mask1;
+ new = &scratch->mask2;
+
+ err = get_nodes(old, old_nodes, maxnode);
if (err)
- return err;
+ goto out;
- err = get_nodes(&new, new_nodes, maxnode);
+ err = get_nodes(new, new_nodes, maxnode);
if (err)
- return err;
+ goto out;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
- task = pid ? find_task_by_pid(pid) : current;
+ rcu_read_lock();
+ task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
- return -ESRCH;
+ rcu_read_unlock();
+ err = -ESRCH;
+ goto out;
}
- mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
+ get_task_struct(task);
- if (!mm)
- return -EINVAL;
+ err = -EINVAL;
/*
* Check if this process has the right to modify the specified
@@ -938,39 +1448,63 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- if ((current->euid != task->suid) && (current->euid != task->uid) &&
- (current->uid != task->suid) && (current->uid != task->uid) &&
+ tcred = __task_cred(task);
+ if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
!capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
- goto out;
+ goto out_put;
}
+ rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
- goto out;
+ goto out_put;
+ }
+
+ if (!nodes_subset(*new, node_states[N_MEMORY])) {
+ err = -EINVAL;
+ goto out_put;
}
err = security_task_movememory(task);
if (err)
+ goto out_put;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+
+ if (!mm) {
+ err = -EINVAL;
goto out;
+ }
- err = do_migrate_pages(mm, &old, &new,
+ err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
-out:
+
mmput(mm);
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+
return err;
+
+out_put:
+ put_task_struct(task);
+ goto out;
+
}
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
{
- int err, pval;
+ int err;
+ int uninitialized_var(pval);
nodemask_t nodes;
if (nmask != NULL && maxnode < MAX_NUMNODES)
@@ -992,10 +1526,10 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
#ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_get_mempolicy(int __user *policy,
- compat_ulong_t __user *nmask,
- compat_ulong_t maxnode,
- compat_ulong_t addr, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode,
+ compat_ulong_t, addr, compat_ulong_t, flags)
{
long err;
unsigned long __user *nm = NULL;
@@ -1011,7 +1545,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
if (!err && nmask) {
- err = copy_from_user(bm, nm, alloc_size);
+ unsigned long copy_size;
+ copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
+ err = copy_from_user(bm, nm, copy_size);
/* ensure entire bitmap is zeroed */
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
err |= compat_put_bitmap(nmask, bm, nr_bits);
@@ -1020,8 +1556,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
return err;
}
-asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode)
+COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1043,9 +1579,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
-asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
- compat_ulong_t mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
+ compat_ulong_t, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode, compat_ulong_t, flags)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1069,51 +1605,133 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
#endif
-/* Return effective policy for a VMA */
-static struct mempolicy * get_vma_policy(struct task_struct *task,
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task: task for fallback if vma policy == default
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Current or other task's task mempolicy and non-shared vma policies must be
+ * protected by task_lock(task) by the caller.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
+ */
+struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy)
- pol = vma->vm_ops->get_policy(vma, addr);
- else if (vma->vm_policy &&
- vma->vm_policy->policy != MPOL_DEFAULT)
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
+ addr);
+ if (vpol)
+ pol = vpol;
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
return pol;
}
-/* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+ struct mempolicy *pol = get_task_policy(task);
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+ }
+ }
+
+ if (!pol)
+ return default_policy.flags & MPOL_F_MOF;
+
+ return pol->flags & MPOL_F_MOF;
+}
+
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
- int nd;
+ enum zone_type dynamic_policy_zone = policy_zone;
+
+ BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+ /*
+ * if policy->v.nodes has movable memory only,
+ * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+ *
+ * policy->v.nodes is intersected with node_states[N_MEMORY],
+ * so if the following test fails, it implies
+ * policy->v.nodes has movable memory only.
+ */
+ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ dynamic_policy_zone = ZONE_MOVABLE;
+
+ return zone >= dynamic_policy_zone;
+}
- switch (policy->policy) {
+/*
+ * Return a nodemask representing a mempolicy for filtering nodes for
+ * page allocation
+ */
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+{
+ /* Lower zones don't get a nodemask applied for MPOL_BIND */
+ if (unlikely(policy->mode == MPOL_BIND) &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+ return &policy->v.nodes;
+
+ return NULL;
+}
+
+/* Return a zonelist indicated by gfp for node representing a mempolicy */
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+ int nd)
+{
+ switch (policy->mode) {
case MPOL_PREFERRED:
- nd = policy->v.preferred_node;
- if (nd < 0)
- nd = numa_node_id();
+ if (!(policy->flags & MPOL_F_LOCAL))
+ nd = policy->v.preferred_node;
break;
case MPOL_BIND:
- /* Lower zones don't get a policy applied */
- /* Careful: current->mems_allowed might have moved */
- if (gfp_zone(gfp) >= policy_zone)
- if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
- return policy->v.zonelist;
- /*FALL THROUGH*/
- case MPOL_INTERLEAVE: /* should not happen */
- case MPOL_DEFAULT:
- nd = numa_node_id();
+ /*
+ * Normally, MPOL_BIND allocations are node-local within the
+ * allowed nodemask. However, if __GFP_THISNODE is set and the
+ * current node isn't part of the mask, we use the zonelist for
+ * the first node in the mask instead.
+ */
+ if (unlikely(gfp & __GFP_THISNODE) &&
+ unlikely(!node_isset(nd, policy->v.nodes)))
+ nd = first_node(policy->v.nodes);
break;
default:
- nd = 0;
BUG();
}
- return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
+ return node_zonelist(nd, gfp);
}
/* Do dynamic interleaving for a process */
@@ -1126,7 +1744,8 @@ static unsigned interleave_nodes(struct mempolicy *policy)
next = next_node(nid, policy->v.nodes);
if (next >= MAX_NUMNODES)
next = first_node(policy->v.nodes);
- me->il_next = next;
+ if (next < MAX_NUMNODES)
+ me->il_next = next;
return nid;
}
@@ -1134,28 +1753,45 @@ static unsigned interleave_nodes(struct mempolicy *policy)
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
-unsigned slab_node(struct mempolicy *policy)
+unsigned int mempolicy_slab_node(void)
{
- int pol = policy ? policy->policy : MPOL_DEFAULT;
+ struct mempolicy *policy;
+ int node = numa_mem_id();
+
+ if (in_interrupt())
+ return node;
+
+ policy = current->mempolicy;
+ if (!policy || policy->flags & MPOL_F_LOCAL)
+ return node;
+
+ switch (policy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * handled MPOL_F_LOCAL above
+ */
+ return policy->v.preferred_node;
- switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
- case MPOL_BIND:
+ case MPOL_BIND: {
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
- return zone_to_nid(policy->v.zonelist->zones[0]);
-
- case MPOL_PREFERRED:
- if (policy->v.preferred_node >= 0)
- return policy->v.preferred_node;
- /* Fall through */
+ struct zonelist *zonelist;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+ zonelist = &NODE_DATA(node)->node_zonelists[0];
+ (void)first_zones_zonelist(zonelist, highest_zoneidx,
+ &policy->v.nodes,
+ &zone);
+ return zone ? zone->node : node;
+ }
default:
- return numa_node_id();
+ BUG();
}
}
@@ -1164,10 +1800,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
- unsigned target = (unsigned)off % nnodes;
+ unsigned target;
int c;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
+ if (!nnodes)
+ return numa_node_id();
+ target = (unsigned int)off % nnodes;
c = 0;
do {
nid = next_node(nid, pol->v.nodes);
@@ -1198,22 +1837,151 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+ int w, bit = NUMA_NO_NODE;
+
+ w = nodes_weight(*maskp);
+ if (w)
+ bit = bitmap_ord_to_pos(maskp->bits,
+ get_random_int() % w, MAX_NUMNODES);
+ return bit;
+}
+
#ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags: for requested zone
+ * @mpol: pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ *
+ * Returns a zonelist suitable for a huge page allocation and a pointer
+ * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'BIND, returns a pointer to the mempolicy's
+ * @nodemask for filtering the zonelist.
+ *
+ * Must be protected by read_mems_allowed_begin()
+ */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags, struct mempolicy **mpol,
+ nodemask_t **nodemask)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct zonelist *zl;
- if (pol->policy == MPOL_INTERLEAVE) {
- unsigned nid;
+ *mpol = get_vma_policy(current, vma, addr);
+ *nodemask = NULL; /* assume !MPOL_BIND */
+
+ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+ zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+ huge_page_shift(hstate_vma(vma))), gfp_flags);
+ } else {
+ zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
+ if ((*mpol)->mode == MPOL_BIND)
+ *nodemask = &(*mpol)->v.nodes;
+ }
+ return zl;
+}
- nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ int nid;
+
+ if (!(mask && current->mempolicy))
+ return false;
+
+ task_lock(current);
+ mempolicy = current->mempolicy;
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ if (mempolicy->flags & MPOL_F_LOCAL)
+ nid = numa_node_id();
+ else
+ nid = mempolicy->v.preferred_node;
+ init_nodemask_of_node(mask, nid);
+ break;
+
+ case MPOL_BIND:
+ /* Fall through */
+ case MPOL_INTERLEAVE:
+ *mask = mempolicy->v.nodes;
+ break;
+
+ default:
+ BUG();
}
- return zonelist_policy(GFP_HIGHUSER, pol);
+ task_unlock(current);
+
+ return true;
}
#endif
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+ * allocate from, they may fallback to other nodes when oom.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -1222,15 +1990,15 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
struct zonelist *zl;
struct page *page;
- zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
+ zl = node_zonelist(nid, gfp);
page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0])
+ if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
/**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
@@ -1239,6 +2007,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
+ * @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
@@ -1252,19 +2021,36 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* Should be called with the mm_sem of the vma hold.
*/
struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, int node)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
+ struct page *page;
+ unsigned int cpuset_mems_cookie;
- cpuset_update_task_memory_state();
+retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+ cpuset_mems_cookie = read_mems_allowed_begin();
- if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
- return alloc_page_interleave(gfp, 0, nid);
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
}
- return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol, node),
+ policy_nodemask(gfp, pol));
+ if (unlikely(mpol_needs_cond_ref(pol)))
+ __mpol_put(pol);
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+ return page;
}
/**
@@ -1288,90 +2074,108 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
+ struct page *page;
+ unsigned int cpuset_mems_cookie;
- if ((gfp & __GFP_WAIT) && !in_interrupt())
- cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
- if (pol->policy == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (pol->mode == MPOL_INTERLEAVE)
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
+
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
+int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+ struct mempolicy *pol = mpol_dup(vma_policy(src));
+
+ if (IS_ERR(pol))
+ return PTR_ERR(pol);
+ dst->vm_policy = pol;
+ return 0;
+}
+
/*
- * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebound by another task (the task that changes
+ * cpuset's mems), so we needn't do rebind work for current task.
*/
-void *cpuset_being_rebound;
-/* Slow path of a mempolicy copy */
-struct mempolicy *__mpol_copy(struct mempolicy *old)
+/* Slow path of a mempolicy duplicate */
+struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* task's mempolicy is protected by alloc_lock */
+ if (old == current->mempolicy) {
+ task_lock(current);
+ *new = *old;
+ task_unlock(current);
+ } else
+ *new = *old;
+
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- mpol_rebind_policy(old, &mems);
+ if (new->flags & MPOL_F_REBINDING)
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
+ else
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- *new = *old;
atomic_set(&new->refcnt, 1);
- if (new->policy == MPOL_BIND) {
- int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
- if (!new->v.zonelist) {
- kmem_cache_free(policy_cache, new);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(new->v.zonelist, old->v.zonelist, sz);
- }
return new;
}
/* Slow path of a mempolicy comparison */
-int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
+bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
- return 0;
- if (a->policy != b->policy)
- return 0;
- switch (a->policy) {
- case MPOL_DEFAULT:
- return 1;
+ return false;
+ if (a->mode != b->mode)
+ return false;
+ if (a->flags != b->flags)
+ return false;
+ if (mpol_store_user_nodemask(a))
+ if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+ return false;
+
+ switch (a->mode) {
+ case MPOL_BIND:
+ /* Fall through */
case MPOL_INTERLEAVE:
- return nodes_equal(a->v.nodes, b->v.nodes);
+ return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
- case MPOL_BIND: {
- int i;
- for (i = 0; a->v.zonelist->zones[i]; i++)
- if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
- return 0;
- return b->v.zonelist->zones[i] == NULL;
- }
default:
BUG();
- return 0;
+ return false;
}
}
-/* Slow path of a mpol destructor. */
-void __mpol_free(struct mempolicy *p)
-{
- if (!atomic_dec_and_test(&p->refcnt))
- return;
- if (p->policy == MPOL_BIND)
- kfree(p->v.zonelist);
- p->policy = MPOL_DEFAULT;
- kmem_cache_free(policy_cache, p);
-}
-
/*
* Shared memory backing store policy support.
*
@@ -1433,8 +2237,8 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
- new->policy ? new->policy->policy : 0);
+ pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
+ new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
@@ -1456,25 +2260,132 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_pages_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = cpu_to_node(thiscpu);
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ polnid = thisnid;
+
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- PDprintk("deleting %lx-l%x\n", n->start, n->end);
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_free(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
-struct sp_node *
-sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
+static void sp_node_init(struct sp_node *node, unsigned long start,
+ unsigned long end, struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ node->start = start;
+ node->end = end;
+ node->policy = pol;
+}
+
+static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+ struct mempolicy *pol)
+{
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
- n->start = start;
- n->end = end;
- mpol_get(pol);
- n->policy = pol;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+ sp_node_init(n, start, end, newpol);
+
return n;
}
@@ -1482,7 +2393,10 @@ sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ struct sp_node *n_new = NULL;
+ struct mempolicy *mpol_new = NULL;
+ int ret = 0;
restart:
spin_lock(&sp->lock);
@@ -1498,16 +2412,16 @@ restart:
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
- if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
- }
+ if (!n_new)
+ goto alloc_new;
+
+ *mpol_new = *n->policy;
+ atomic_set(&mpol_new->refcnt, 1);
+ sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
- sp_insert(sp, new2);
- new2 = NULL;
+ sp_insert(sp, n_new);
+ n_new = NULL;
+ mpol_new = NULL;
break;
} else
n->end = start;
@@ -1519,34 +2433,74 @@ restart:
if (new)
sp_insert(sp, new);
spin_unlock(&sp->lock);
- if (new2) {
- mpol_free(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
-}
+ ret = 0;
-void mpol_shared_policy_init(struct shared_policy *info, int policy,
- nodemask_t *policy_nodes)
-{
- info->root = RB_ROOT;
- spin_lock_init(&info->lock);
+err_out:
+ if (mpol_new)
+ mpol_put(mpol_new);
+ if (n_new)
+ kmem_cache_free(sn_cache, n_new);
- if (policy != MPOL_DEFAULT) {
- struct mempolicy *newpol;
+ return ret;
- /* Falls back to MPOL_DEFAULT on any error */
- newpol = mpol_new(policy, policy_nodes);
- if (!IS_ERR(newpol)) {
- /* Create pseudo-vma that contains just the policy */
- struct vm_area_struct pvma;
+alloc_new:
+ spin_unlock(&sp->lock);
+ ret = -ENOMEM;
+ n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n_new)
+ goto err_out;
+ mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!mpol_new)
+ goto err_out;
+ goto restart;
+}
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- /* Policy covers entire file */
- pvma.vm_end = TASK_SIZE;
- mpol_set_shared_policy(info, &pvma, newpol);
- mpol_free(newpol);
- }
+/**
+ * mpol_shared_policy_init - initialize shared policy for inode
+ * @sp: pointer to inode shared policy
+ * @mpol: struct mempolicy to install
+ *
+ * Install non-NULL @mpol in inode's shared policy rb-tree.
+ * On entry, the current task has a reference on a non-NULL @mpol.
+ * This must be released on exit.
+ * This is called at get_inode() calls and we can use GFP_KERNEL.
+ */
+void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+{
+ int ret;
+
+ sp->root = RB_ROOT; /* empty tree == default mempolicy */
+ spin_lock_init(&sp->lock);
+
+ if (mpol) {
+ struct vm_area_struct pvma;
+ struct mempolicy *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ goto put_mpol;
+ /* contextualize the tmpfs mount point mempolicy */
+ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ if (IS_ERR(new))
+ goto free_scratch; /* no valid nodemask intersection */
+
+ task_lock(current);
+ ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+ task_unlock(current);
+ if (ret)
+ goto put_new;
+
+ /* Create pseudo-vma that contains just the policy */
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ pvma.vm_end = TASK_SIZE; /* policy covers entire file */
+ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_new:
+ mpol_put(new); /* drop initial ref */
+free_scratch:
+ NODEMASK_SCRATCH_FREE(scratch);
+put_mpol:
+ mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
@@ -1557,10 +2511,11 @@ int mpol_set_shared_policy(struct shared_policy *info,
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
- sz, npol? npol->policy : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : -1);
+ sz, npol ? npol->mode : -1,
+ npol ? npol->flags : -1,
+ npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1569,7 +2524,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
@@ -1586,325 +2541,328 @@ void mpol_free_shared_policy(struct shared_policy *p)
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_free(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
spin_unlock(&p->lock);
}
-/* assumes fs == KERNEL_DS */
-void __init numa_policy_init(void)
+#ifdef CONFIG_NUMA_BALANCING
+static int __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
{
- policy_cache = kmem_cache_create("numa_policy",
- sizeof(struct mempolicy),
- 0, SLAB_PANIC, NULL, NULL);
+ bool numabalancing_default = false;
- sn_cache = kmem_cache_create("shared_policy_node",
- sizeof(struct sp_node),
- 0, SLAB_PANIC, NULL, NULL);
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
- /* Set interleaving policy for system init. This way not all
- the data structures allocated at system boot end up in node zero. */
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
- if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
- printk("numa_policy_init: interleaving failed\n");
+ if (nr_node_ids > 1 && !numabalancing_override) {
+ pr_info("%s automatic NUMA balancing. "
+ "Configure with numa_balancing= or the "
+ "kernel.numa_balancing sysctl",
+ numabalancing_default ? "Enabling" : "Disabling");
+ set_numabalancing_state(numabalancing_default);
+ }
}
-/* Reset policy of current process to default */
-void numa_default_policy(void)
+static int __init setup_numabalancing(char *str)
{
- do_set_mempolicy(MPOL_DEFAULT, NULL);
+ int ret = 0;
+ if (!str)
+ goto out;
+
+ if (!strcmp(str, "enable")) {
+ numabalancing_override = 1;
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ numabalancing_override = -1;
+ ret = 1;
+ }
+out:
+ if (!ret)
+ pr_warn("Unable to parse numa_balancing=\n");
+
+ return ret;
}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
-/* Migrate a policy to a different set of nodes */
-void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
{
- nodemask_t *mpolmask;
- nodemask_t tmp;
+ nodemask_t interleave_nodes;
+ unsigned long largest = 0;
+ int nid, prefer = 0;
- if (!pol)
- return;
- mpolmask = &pol->cpuset_mems_allowed;
- if (nodes_equal(*mpolmask, *newmask))
- return;
+ policy_cache = kmem_cache_create("numa_policy",
+ sizeof(struct mempolicy),
+ 0, SLAB_PANIC, NULL);
- switch (pol->policy) {
- case MPOL_DEFAULT:
- break;
- case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
- pol->v.nodes = tmp;
- *mpolmask = *newmask;
- current->il_next = node_remap(current->il_next,
- *mpolmask, *newmask);
- break;
- case MPOL_PREFERRED:
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *mpolmask, *newmask);
- *mpolmask = *newmask;
- break;
- case MPOL_BIND: {
- nodemask_t nodes;
- struct zone **z;
- struct zonelist *zonelist;
+ sn_cache = kmem_cache_create("shared_policy_node",
+ sizeof(struct sp_node),
+ 0, SLAB_PANIC, NULL);
+
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
- nodes_clear(nodes);
- for (z = pol->v.zonelist->zones; *z; z++)
- node_set(zone_to_nid(*z), nodes);
- nodes_remap(tmp, nodes, *mpolmask, *newmask);
- nodes = tmp;
+ /*
+ * Set interleaving policy for system init. Interleaving is only
+ * enabled across suitably sized nodes (default is >= 16MB), or
+ * fall back to the largest node if they're all smaller.
+ */
+ nodes_clear(interleave_nodes);
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long total_pages = node_present_pages(nid);
+
+ /* Preserve the largest node */
+ if (largest < total_pages) {
+ largest = total_pages;
+ prefer = nid;
+ }
- zonelist = bind_zonelist(&nodes);
+ /* Interleave this node? */
+ if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+ node_set(nid, interleave_nodes);
+ }
- /* If no mem, then zonelist is NULL and we keep old zonelist.
- * If that old zonelist has no remaining mems_allowed nodes,
- * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
- */
+ /* All too small, use the largest */
+ if (unlikely(nodes_empty(interleave_nodes)))
+ node_set(prefer, interleave_nodes);
- if (zonelist) {
- /* Good - got mem - substitute new zonelist */
- kfree(pol->v.zonelist);
- pol->v.zonelist = zonelist;
- }
- *mpolmask = *newmask;
- break;
- }
- default:
- BUG();
- break;
- }
-}
+ if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
+ pr_err("%s: interleaving failed\n", __func__);
-/*
- * Wrapper for mpol_rebind_policy() that just requires task
- * pointer, and updates task mempolicy.
- */
+ check_numabalancing_enable();
+}
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+/* Reset policy of current process to default */
+void numa_default_policy(void)
{
- mpol_rebind_policy(tsk->mempolicy, new);
+ do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
- * Rebind each vma in mm to new nodemask.
- *
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Parse and format mempolicy from/to strings
*/
-void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
-{
- struct vm_area_struct *vma;
-
- down_write(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
-}
-
/*
- * Display pages allocated per node and memory policy via /proc.
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
+static const char * const policy_modes[] =
+{
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "prefer",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local",
+};
-static const char *policy_types[] = { "default", "prefer", "bind",
- "interleave" };
-/*
- * Convert a mempolicy into a string.
- * Returns the number of characters in buffer (if positive)
- * or an error (negative)
+#ifdef CONFIG_TMPFS
+/**
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
+ * @str: string containing mempolicy to parse
+ * @mpol: pointer to struct mempolicy pointer, returned on success.
+ *
+ * Format of input:
+ * <mode>[=<flags>][:<nodelist>]
+ *
+ * On success, returns 0, else 1
*/
-static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+int mpol_parse_str(char *str, struct mempolicy **mpol)
{
- char *p = buffer;
- int l;
+ struct mempolicy *new = NULL;
+ unsigned short mode;
+ unsigned short mode_flags;
nodemask_t nodes;
- int mode = pol ? pol->policy : MPOL_DEFAULT;
-
- switch (mode) {
- case MPOL_DEFAULT:
+ char *nodelist = strchr(str, ':');
+ char *flags = strchr(str, '=');
+ int err = 1;
+
+ if (nodelist) {
+ /* NUL-terminate mode or flags string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, nodes))
+ goto out;
+ if (!nodes_subset(nodes, node_states[N_MEMORY]))
+ goto out;
+ } else
nodes_clear(nodes);
- break;
- case MPOL_PREFERRED:
- nodes_clear(nodes);
- node_set(pol->v.preferred_node, nodes);
- break;
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
- case MPOL_BIND:
- get_zonemask(pol, &nodes);
- break;
+ for (mode = 0; mode < MPOL_MAX; mode++) {
+ if (!strcmp(str, policy_modes[mode])) {
+ break;
+ }
+ }
+ if (mode >= MPOL_MAX)
+ goto out;
+ switch (mode) {
+ case MPOL_PREFERRED:
+ /*
+ * Insist on a nodelist of one node only
+ */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest))
+ rest++;
+ if (*rest)
+ goto out;
+ }
+ break;
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ /*
+ * Default to online nodes with memory if no nodelist
+ */
+ if (!nodelist)
+ nodes = node_states[N_MEMORY];
break;
-
- default:
- BUG();
- return -EFAULT;
+ case MPOL_LOCAL:
+ /*
+ * Don't allow a nodelist; mpol_new() checks flags
+ */
+ if (nodelist)
+ goto out;
+ mode = MPOL_PREFERRED;
+ break;
+ case MPOL_DEFAULT:
+ /*
+ * Insist on an empty nodelist
+ */
+ if (!nodelist)
+ err = 0;
+ goto out;
+ case MPOL_BIND:
+ /*
+ * Insist on a nodelist
+ */
+ if (!nodelist)
+ goto out;
}
- l = strlen(policy_types[mode]);
- if (buffer + maxlen < p + l + 1)
- return -ENOSPC;
-
- strcpy(p, policy_types[mode]);
- p += l;
-
- if (!nodes_empty(nodes)) {
- if (buffer + maxlen < p + 2)
- return -ENOSPC;
- *p++ = '=';
- p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+ mode_flags = 0;
+ if (flags) {
+ /*
+ * Currently, we only support two mutually exclusive
+ * mode flags.
+ */
+ if (!strcmp(flags, "static"))
+ mode_flags |= MPOL_F_STATIC_NODES;
+ else if (!strcmp(flags, "relative"))
+ mode_flags |= MPOL_F_RELATIVE_NODES;
+ else
+ goto out;
}
- return p - buffer;
-}
-
-struct numa_maps {
- unsigned long pages;
- unsigned long anon;
- unsigned long active;
- unsigned long writeback;
- unsigned long mapcount_max;
- unsigned long dirty;
- unsigned long swapcache;
- unsigned long node[MAX_NUMNODES];
-};
-
-static void gather_stats(struct page *page, void *private, int pte_dirty)
-{
- struct numa_maps *md = private;
- int count = page_mapcount(page);
-
- md->pages++;
- if (pte_dirty || PageDirty(page))
- md->dirty++;
- if (PageSwapCache(page))
- md->swapcache++;
-
- if (PageActive(page))
- md->active++;
+ new = mpol_new(mode, mode_flags, &nodes);
+ if (IS_ERR(new))
+ goto out;
- if (PageWriteback(page))
- md->writeback++;
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
- if (PageAnon(page))
- md->anon++;
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
- if (count > md->mapcount_max)
- md->mapcount_max = count;
+ err = 0;
- md->node[page_to_nid(page)]++;
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ if (flags)
+ *--flags = '=';
+ if (!err)
+ *mpol = new;
+ return err;
}
+#endif /* CONFIG_TMPFS */
-#ifdef CONFIG_HUGETLB_PAGE
-static void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer: to contain formatted mempolicy string
+ * @maxlen: length of @buffer
+ * @pol: pointer to mempolicy to be formatted
+ *
+ * Convert @pol into a string. If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
+ * longest flag, "relative", and to display at least a few node ids.
+ */
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
- unsigned long addr;
- struct page *page;
-
- for (addr = start; addr < end; addr += HPAGE_SIZE) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
- pte_t pte;
-
- if (!ptep)
- continue;
-
- pte = *ptep;
- if (pte_none(pte))
- continue;
-
- page = pte_page(pte);
- if (!page)
- continue;
+ char *p = buffer;
+ nodemask_t nodes = NODE_MASK_NONE;
+ unsigned short mode = MPOL_DEFAULT;
+ unsigned short flags = 0;
- gather_stats(page, md, pte_dirty(*ptep));
+ if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
+ mode = pol->mode;
+ flags = pol->flags;
}
-}
-#else
-static inline void check_huge_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct numa_maps *md)
-{
-}
-#endif
-int show_numa_map(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct vm_area_struct *vma = v;
- struct numa_maps *md;
- struct file *file = vma->vm_file;
- struct mm_struct *mm = vma->vm_mm;
- int n;
- char buffer[50];
-
- if (!mm)
- return 0;
-
- md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
- if (!md)
- return 0;
+ switch (mode) {
+ case MPOL_DEFAULT:
+ break;
+ case MPOL_PREFERRED:
+ if (flags & MPOL_F_LOCAL)
+ mode = MPOL_LOCAL;
+ else
+ node_set(pol->v.preferred_node, nodes);
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ nodes = pol->v.nodes;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ snprintf(p, maxlen, "unknown");
+ return;
+ }
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(priv->task, vma, vma->vm_start));
+ p += snprintf(p, maxlen, "%s", policy_modes[mode]);
- seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+ if (flags & MPOL_MODE_FLAGS) {
+ p += snprintf(p, buffer + maxlen - p, "=");
- if (file) {
- seq_printf(m, " file=");
- seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
- seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
+ /*
+ * Currently, the only defined flags are mutually exclusive
+ */
+ if (flags & MPOL_F_STATIC_NODES)
+ p += snprintf(p, buffer + maxlen - p, "static");
+ else if (flags & MPOL_F_RELATIVE_NODES)
+ p += snprintf(p, buffer + maxlen - p, "relative");
}
- if (is_vm_hugetlb_page(vma)) {
- check_huge_range(vma, vma->vm_start, vma->vm_end, md);
- seq_printf(m, " huge");
- } else {
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_online_map, MPOL_MF_STATS, md);
+ if (!nodes_empty(nodes)) {
+ p += snprintf(p, buffer + maxlen - p, ":");
+ p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
-
- if (!md->pages)
- goto out;
-
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
-
- if (md->dirty)
- seq_printf(m," dirty=%lu",md->dirty);
-
- if (md->pages != md->anon && md->pages != md->dirty)
- seq_printf(m, " mapped=%lu", md->pages);
-
- if (md->mapcount_max > 1)
- seq_printf(m, " mapmax=%lu", md->mapcount_max);
-
- if (md->swapcache)
- seq_printf(m," swapcache=%lu", md->swapcache);
-
- if (md->active < md->pages && !is_vm_hugetlb_page(vma))
- seq_printf(m," active=%lu", md->active);
-
- if (md->writeback)
- seq_printf(m," writeback=%lu", md->writeback);
-
- for_each_online_node(n)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
-out:
- seq_putc(m, '\n');
- kfree(md);
-
- if (m->count < m->size)
- m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
- return 0;
}
-
diff --git a/mm/mempool.c b/mm/mempool.c
index ccd8cb8cd41..e209c98c720 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
@@ -27,7 +28,15 @@ static void *remove_element(mempool_t *pool)
return pool->elements[--pool->curr_nr];
}
-static void free_pool(mempool_t *pool)
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool: pointer to the memory pool which was allocated via
+ * mempool_create().
+ *
+ * Free all reserved elements in @pool and @pool itself. This function
+ * only sleeps if the free_fn() function sleeps.
+ */
+void mempool_destroy(mempool_t *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
@@ -36,6 +45,7 @@ static void free_pool(mempool_t *pool)
kfree(pool->elements);
kfree(pool);
}
+EXPORT_SYMBOL(mempool_destroy);
/**
* mempool_create - create a memory pool
@@ -46,28 +56,29 @@ static void free_pool(mempool_t *pool)
* @pool_data: optional private data available to the user-defined functions.
*
* this function creates and allocates a guaranteed size, preallocated
- * memory pool. The pool can be used from the mempool_alloc and mempool_free
+ * memory pool. The pool can be used from the mempool_alloc() and mempool_free()
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
- * functions might sleep - as long as the mempool_alloc function is not called
+ * functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
{
- return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+ return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
+ GFP_KERNEL, NUMA_NO_NODE);
}
EXPORT_SYMBOL(mempool_create);
mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
- mempool_free_t *free_fn, void *pool_data, int node_id)
+ mempool_free_t *free_fn, void *pool_data,
+ gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
- pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
+ pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
if (!pool)
return NULL;
- memset(pool, 0, sizeof(*pool));
pool->elements = kmalloc_node(min_nr * sizeof(void *),
- GFP_KERNEL, node_id);
+ gfp_mask, node_id);
if (!pool->elements) {
kfree(pool);
return NULL;
@@ -85,9 +96,9 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
while (pool->curr_nr < pool->min_nr) {
void *element;
- element = pool->alloc(GFP_KERNEL, pool->pool_data);
+ element = pool->alloc(gfp_mask, pool->pool_data);
if (unlikely(!element)) {
- free_pool(pool);
+ mempool_destroy(pool);
return NULL;
}
add_element(pool, element);
@@ -173,32 +184,16 @@ out:
EXPORT_SYMBOL(mempool_resize);
/**
- * mempool_destroy - deallocate a memory pool
- * @pool: pointer to the memory pool which was allocated via
- * mempool_create().
- *
- * this function only sleeps if the free_fn() function sleeps. The caller
- * has to guarantee that all elements have been returned to the pool (ie:
- * freed) prior to calling mempool_destroy().
- */
-void mempool_destroy(mempool_t *pool)
-{
- /* Check for outstanding elements */
- BUG_ON(pool->curr_nr != pool->min_nr);
- free_pool(pool);
-}
-EXPORT_SYMBOL(mempool_destroy);
-
-/**
* mempool_alloc - allocate an element from a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @gfp_mask: the usual allocation bitmask.
*
- * this function only sleeps if the alloc_fn function sleeps or
+ * this function only sleeps if the alloc_fn() function sleeps or
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
*/
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
@@ -207,6 +202,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait;
gfp_t gfp_temp;
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_WAIT);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
@@ -225,28 +221,45 @@ repeat_alloc:
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
+ /* paired with rmb in mempool_free(), read comment there */
+ smp_wmb();
+ /*
+ * Update the allocation stack trace as this is more useful
+ * for debugging.
+ */
+ kmemleak_update_trace(element);
return element;
}
- spin_unlock_irqrestore(&pool->lock, flags);
- /* We must not sleep in the GFP_ATOMIC case */
- if (!(gfp_mask & __GFP_WAIT))
+ /*
+ * We use gfp mask w/o __GFP_WAIT or IO for the first round. If
+ * alloc failed with that and @pool was empty, retry immediately.
+ */
+ if (gfp_temp != gfp_mask) {
+ spin_unlock_irqrestore(&pool->lock, flags);
+ gfp_temp = gfp_mask;
+ goto repeat_alloc;
+ }
+
+ /* We must not sleep if !__GFP_WAIT */
+ if (!(gfp_mask & __GFP_WAIT)) {
+ spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
+ }
- /* Now start performing page reclaim */
- gfp_temp = gfp_mask;
+ /* Let's wait for someone else to return an element to @pool */
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
- smp_mb();
- if (!pool->curr_nr) {
- /*
- * FIXME: this should be io_schedule(). The timeout is there
- * as a workaround for some DM problems in 2.6.18.
- */
- io_schedule_timeout(5*HZ);
- }
- finish_wait(&pool->wait, &wait);
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ /*
+ * FIXME: this should be io_schedule(). The timeout is there as a
+ * workaround for some DM problems in 2.6.18.
+ */
+ io_schedule_timeout(5*HZ);
+
+ finish_wait(&pool->wait, &wait);
goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc);
@@ -263,10 +276,45 @@ void mempool_free(void *element, mempool_t *pool)
{
unsigned long flags;
- smp_mb();
- if (pool->curr_nr < pool->min_nr) {
+ if (unlikely(element == NULL))
+ return;
+
+ /*
+ * Paired with the wmb in mempool_alloc(). The preceding read is
+ * for @element and the following @pool->curr_nr. This ensures
+ * that the visible value of @pool->curr_nr is from after the
+ * allocation of @element. This is necessary for fringe cases
+ * where @element was passed to this task without going through
+ * barriers.
+ *
+ * For example, assume @p is %NULL at the beginning and one task
+ * performs "p = mempool_alloc(...);" while another task is doing
+ * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
+ * may end up using curr_nr value which is from before allocation
+ * of @p without the following rmb.
+ */
+ smp_rmb();
+
+ /*
+ * For correctness, we need a test which is guaranteed to trigger
+ * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
+ * without locking achieves that and refilling as soon as possible
+ * is desirable.
+ *
+ * Because curr_nr visible here is always a value after the
+ * allocation of @element, any task which decremented curr_nr below
+ * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+ * incremented to min_nr afterwards. If curr_nr gets incremented
+ * to min_nr after the allocation of @element, the elements
+ * allocated after that are subject to the same guarantee.
+ *
+ * Waiters happen iff curr_nr is 0 and the above guarantee also
+ * ensures that there will be frees which return elements to the
+ * pool waking up the waiters.
+ */
+ if (unlikely(pool->curr_nr < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
- if (pool->curr_nr < pool->min_nr) {
+ if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
@@ -297,22 +345,15 @@ EXPORT_SYMBOL(mempool_free_slab);
/*
* A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
- * specfied by pool_data
+ * specified by pool_data
*/
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
- size_t size = (size_t)(long)pool_data;
+ size_t size = (size_t)pool_data;
return kmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);
-void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
-{
- size_t size = (size_t) pool_data;
- return kzalloc(size, gfp_mask);
-}
-EXPORT_SYMBOL(mempool_kzalloc);
-
void mempool_kfree(void *element, void *pool_data)
{
kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 20a8c2687b1..be6dbf995c0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -9,17 +9,19 @@
* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
* Hirokazu Takahashi <taka@valinux.co.jp>
* Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter
*/
#include <linux/migrate.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
+#include <linux/nsproxy.h>
#include <linux/pagevec.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
@@ -28,45 +30,25 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/memcontrol.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+#include <linux/gfp.h>
+#include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
-#include "internal.h"
+#include <asm/tlbflush.h>
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
-/*
- * Isolate one page from the LRU lists. If successful put it onto
- * the indicated list with elevated page count.
- *
- * Result:
- * -EBUSY: page not on LRU list
- * 0: page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page, struct list_head *pagelist)
-{
- int ret = -EBUSY;
-
- if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
-
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page)) {
- ret = 0;
- get_page(page);
- ClearPageLRU(page);
- if (PageActive(page))
- del_page_from_active_list(zone, page);
- else
- del_page_from_inactive_list(zone, page);
- list_add_tail(&page->lru, pagelist);
- }
- spin_unlock_irq(&zone->lru_lock);
- }
- return ret;
-}
+#include "internal.h"
/*
* migrate_prep() needs to be called before we start compiling a list of pages
- * to be migrated using isolate_lru_page().
+ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
+ * undesirable, use migrate_prep_local()
*/
int migrate_prep(void)
{
@@ -81,159 +63,147 @@ int migrate_prep(void)
return 0;
}
-static inline void move_to_lru(struct page *page)
+/* Do the necessary work of migrate_prep but not if it involves other CPUs */
+int migrate_prep_local(void)
{
- if (PageActive(page)) {
- /*
- * lru_cache_add_active checks that
- * the PG_active bit is off.
- */
- ClearPageActive(page);
- lru_cache_add_active(page);
- } else {
- lru_cache_add(page);
- }
- put_page(page);
+ lru_add_drain();
+
+ return 0;
}
/*
- * Add isolated pages on the list back to the LRU.
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
*
- * returns the number of pages put back.
+ * This function shall be used whenever the isolated pageset has been
+ * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
+ * and isolate_huge_page().
*/
-int putback_lru_pages(struct list_head *l)
+void putback_movable_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
- int count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
+ if (unlikely(PageHuge(page))) {
+ putback_active_hugepage(page);
+ continue;
+ }
list_del(&page->lru);
- move_to_lru(page);
- count++;
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ if (unlikely(isolated_balloon_page(page)))
+ balloon_page_putback(page);
+ else
+ putback_lru_page(page);
}
- return count;
-}
-
-static inline int is_swap_pte(pte_t pte)
-{
- return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
}
/*
* Restore a potential migration pte to a working pte entry
*/
-static void remove_migration_pte(struct vm_area_struct *vma,
- struct page *old, struct page *new)
+static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+ unsigned long addr, void *old)
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
- unsigned long addr = page_address_in_vma(new, vma);
-
- if (addr == -EFAULT)
- return;
-
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- return;
+ if (unlikely(PageHuge(new))) {
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep)
+ goto out;
+ ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
+ } else {
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ goto out;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- return;
+ ptep = pte_offset_map(pmd, addr);
- ptep = pte_offset_map(pmd, addr);
+ /*
+ * Peek to check is_swap_pte() before taking ptlock? No, we
+ * can race mremap's move_ptes(), which skips anon_vma lock.
+ */
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- return;
- }
+ ptl = pte_lockptr(mm, pmd);
+ }
- ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
- goto out;
+ goto unlock;
entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
- goto out;
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != old)
+ goto unlock;
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+ if (pte_swp_soft_dirty(*ptep))
+ pte = pte_mksoft_dirty(pte);
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new)) {
+ pte = pte_mkhuge(pte);
+ pte = arch_make_huge_pte(pte, vma, new, 0);
+ }
+#endif
+ flush_dcache_page(new);
set_pte_at(mm, addr, ptep, pte);
- if (PageAnon(new))
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, addr);
+ else
+ page_dup_rmap(new);
+ } else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, pte);
- lazy_mmu_prot_update(pte);
-
-out:
+ update_mmu_cache(vma, addr, ptep);
+unlock:
pte_unmap_unlock(ptep, ptl);
+out:
+ return SWAP_AGAIN;
}
/*
- * Note that remove_file_migration_ptes will only work on regular mappings,
- * Nonlinear mappings do not use migration entries.
- */
-static void remove_file_migration_ptes(struct page *old, struct page *new)
-{
- struct vm_area_struct *vma;
- struct address_space *mapping = page_mapping(new);
- struct prio_tree_iter iter;
- pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
- if (!mapping)
- return;
-
- spin_lock(&mapping->i_mmap_lock);
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
- remove_migration_pte(vma, old, new);
-
- spin_unlock(&mapping->i_mmap_lock);
-}
-
-/*
- * Must hold mmap_sem lock on at least one of the vmas containing
- * the page so that the anon_vma cannot vanish.
+ * Congratulations to trinity for discovering this bug.
+ * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
+ * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
+ * replace the specified range by file ptes throughout (maybe populated after).
+ * If page migration finds a page within that range, while it's still located
+ * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
+ * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
+ * But if the migrating page is in a part of the vma outside the range to be
+ * remapped, then it will not be cleared, and remove_migration_ptes() needs to
+ * deal with it. Fortunately, this part of the vma is of course still linear,
+ * so we just need to use linear location on the nonlinear list.
*/
-static void remove_anon_migration_ptes(struct page *old, struct page *new)
+static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
+ struct address_space *mapping, void *arg)
{
- struct anon_vma *anon_vma;
struct vm_area_struct *vma;
- unsigned long mapping;
-
- mapping = (unsigned long)new->mapping;
-
- if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
- return;
-
- /*
- * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
- */
- anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
+ /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ unsigned long addr;
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
- remove_migration_pte(vma, old, new);
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
- spin_unlock(&anon_vma->lock);
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (addr >= vma->vm_start && addr < vma->vm_end)
+ remove_migration_pte(page, vma, addr, arg);
+ }
+ return SWAP_AGAIN;
}
/*
@@ -242,28 +212,28 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- if (PageAnon(new))
- remove_anon_migration_ptes(old, new);
- else
- remove_file_migration_ptes(old, new);
+ struct rmap_walk_control rwc = {
+ .rmap_one = remove_migration_pte,
+ .arg = old,
+ .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
+ };
+
+ rmap_walk(new, &rwc);
}
/*
* Something used the pte of a page under migration. We need to
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
- *
- * This function is called from do_swap_page().
*/
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl)
{
- pte_t *ptep, pte;
- spinlock_t *ptl;
+ pte_t pte;
swp_entry_t entry;
struct page *page;
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
@@ -274,7 +244,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
page = migration_entry_to_page(entry);
- get_page(page);
+ /*
+ * Once radix-tree replacement of page migration started, page_count
+ * *must* be zero. And, we don't want to call wait_on_page_locked()
+ * against a page without get_page().
+ * So, we use get_page_unless_zero(), here. Even failed, page fault
+ * will occur again.
+ */
+ if (!get_page_unless_zero(page))
+ goto out;
pte_unmap_unlock(ptep, ptl);
wait_on_page_locked(page);
put_page(page);
@@ -283,62 +261,266 @@ out:
pte_unmap_unlock(ptep, ptl);
}
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ pte_t *ptep = pte_offset_map(pmd, address);
+ __migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+ struct mm_struct *mm, pte_t *pte)
+{
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+ __migration_entry_wait(mm, pte, ptl);
+}
+
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
/*
* Replace the page in the mapping.
*
* The number of remaining references must be:
* 1 for anonymous pages without a mapping
* 2 for pages with a mapping
- * 3 for pages with a mapping and PagePrivate set.
+ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
-static int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode,
+ int extra_count)
{
- struct page **radix_pointer;
+ int expected_count = 1 + extra_count;
+ void **pslot;
if (!mapping) {
- /* Anonymous page */
- if (page_count(page) != 1)
+ /* Anonymous page without mapping */
+ if (page_count(page) != expected_count)
return -EAGAIN;
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
- radix_pointer = (struct page **)radix_tree_lookup_slot(
- &mapping->page_tree,
- page_index(page));
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
- if (page_count(page) != 2 + !!PagePrivate(page) ||
- *radix_pointer != page) {
- write_unlock_irq(&mapping->tree_lock);
+ expected_count += 1 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
+ * In the async migration case of moving a page with buffers, lock the
+ * buffers using trylock before the mapping is moved. If the mapping
+ * was moved, we later failed to lock the buffers and could not move
+ * the mapping back due to an elevated page count, we would have to
+ * block waiting on other references to be dropped.
+ */
+ if (mode == MIGRATE_ASYNC && head &&
+ !buffer_migrate_lock_buffers(head, mode)) {
+ page_unfreeze_refs(page, expected_count);
+ spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
/*
* Now we know that no one else is looking at the page.
*/
- get_page(newpage);
-#ifdef CONFIG_SWAP
+ get_page(newpage); /* add cache reference */
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
-#endif
- *radix_pointer = newpage;
- __put_page(page);
- write_unlock_irq(&mapping->tree_lock);
+ radix_tree_replace_slot(pslot, newpage);
- return 0;
+ /*
+ * Drop cache reference from old page by unfreezing
+ * to one less reference.
+ * We know this isn't the last reference.
+ */
+ page_unfreeze_refs(page, expected_count - 1);
+
+ /*
+ * If moved to a different zone then also account
+ * the page for that zone. Other VM counters will be
+ * taken care of when we establish references to the
+ * new page and drop references to the old page.
+ *
+ * Note that anonymous pages are accounted for
+ * via NR_FILE_PAGES and NR_ANON_PAGES if they
+ * are mapped to swap space.
+ */
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ if (!PageSwapCache(page) && PageSwapBacked(page)) {
+ __dec_zone_page_state(page, NR_SHMEM);
+ __inc_zone_page_state(newpage, NR_SHMEM);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
+
+ return MIGRATEPAGE_SUCCESS;
+}
+
+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return MIGRATEPAGE_SUCCESS;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ get_page(newpage);
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count - 1);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ return MIGRATEPAGE_SUCCESS;
+}
+
+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page. We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+ int nr_pages)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < nr_pages; ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+static void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ int nr_pages;
+
+ if (PageHuge(src)) {
+ /* hugetlbfs page */
+ struct hstate *h = page_hstate(src);
+ nr_pages = pages_per_huge_page(h);
+
+ if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+ __copy_gigantic_page(dst, src, nr_pages);
+ return;
+ }
+ } else {
+ /* thp page */
+ BUG_ON(!PageTransHuge(src));
+ nr_pages = hpage_nr_pages(src);
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
}
/*
* Copy the page to its new location
*/
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
{
- copy_highpage(newpage, page);
+ int cpupid;
+
+ if (PageHuge(page) || PageTransHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
@@ -346,8 +528,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
SetPageReferenced(newpage);
if (PageUptodate(page))
SetPageUptodate(newpage);
- if (PageActive(page))
+ if (TestClearPageActive(page)) {
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
SetPageActive(newpage);
+ } else if (TestClearPageUnevictable(page))
+ SetPageUnevictable(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -355,16 +540,35 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
if (PageDirty(page)) {
clear_page_dirty_for_io(page);
- set_page_dirty(newpage);
+ /*
+ * Want to mark the page and the radix tree as dirty, and
+ * redo the accounting that clear_page_dirty_for_io undid,
+ * but we can't use set_page_dirty because that function
+ * is actually a signal that all of the page has become dirty.
+ * Whereas only part of our page may be dirty.
+ */
+ if (PageSwapBacked(page))
+ SetPageDirty(newpage);
+ else
+ __set_page_dirty_nobuffers(newpage);
}
-#ifdef CONFIG_SWAP
+ /*
+ * Copy NUMA information to the new page, to prevent over-eager
+ * future migrations of this same page.
+ */
+ cpupid = page_cpupid_xchg_last(page, -1);
+ page_cpupid_xchg_last(newpage, cpupid);
+
+ mlock_migrate_page(newpage, page);
+ ksm_migrate_page(newpage, page);
+ /*
+ * Please do not reorder this without considering how mm/ksm.c's
+ * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+ */
ClearPageSwapCache(page);
-#endif
- ClearPageActive(page);
ClearPagePrivate(page);
set_page_private(page, 0);
- page->mapping = NULL;
/*
* If any waiters have accumulated on the new page then
@@ -378,65 +582,59 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
* Migration functions
***********************************************************/
-/* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
-{
- return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
/*
* Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
+ * pages that do not use PagePrivate/PagePrivate2.
*
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
+#ifdef CONFIG_BLOCK
/*
* Migration function for pages with buffers. This function can only be used
* if the underlying filesystem guarantees that no other references to "page"
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
struct buffer_head *bh, *head;
int rc;
if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ /*
+ * In the async case, migrate_page_move_mapping locked the buffers
+ * with an IRQ-safe spinlock held. In the sync case, the buffers
+ * need to be locked now
+ */
+ if (mode != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(head, mode));
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -463,9 +661,10 @@ int buffer_migrate_page(struct address_space *mapping,
} while (bh != head);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(buffer_migrate_page);
+#endif
/*
* Writeback a page to clean the dirty state
@@ -477,7 +676,6 @@ static int writeout(struct address_space *mapping, struct page *page)
.nr_to_write = 1,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1
};
int rc;
@@ -501,35 +699,36 @@ static int writeout(struct address_space *mapping, struct page *page)
remove_migration_ptes(page, page);
rc = mapping->a_ops->writepage(page, &wbc);
- if (rc < 0)
- /* I/O Error writing */
- return -EIO;
if (rc != AOP_WRITEPAGE_ACTIVATE)
/* unlocked. Relock */
lock_page(page);
- return -EAGAIN;
+ return (rc < 0) ? -EIO : -EAGAIN;
}
/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
- if (PageDirty(page))
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
return writeout(mapping, page);
+ }
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_buffers(page) &&
+ if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
/*
@@ -538,8 +737,13 @@ static int fallback_migrate_page(struct address_space *mapping,
*
* The new page will have replaced the old page if this function
* is successful.
+ *
+ * Return value:
+ * < 0 - error code
+ * MIGRATEPAGE_SUCCESS - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page)
+static int move_to_new_page(struct page *newpage, struct page *page,
+ int remap_swapcache, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -549,45 +753,192 @@ static int move_to_new_page(struct page *newpage, struct page *page)
* establishing additional references. We are the only one
* holding a reference to the new page at this point.
*/
- if (TestSetPageLocked(newpage))
+ if (!trylock_page(newpage))
BUG();
/* Prepare mapping for the new page.*/
newpage->index = page->index;
newpage->mapping = page->mapping;
+ if (PageSwapBacked(page))
+ SetPageSwapBacked(newpage);
mapping = page_mapping(page);
if (!mapping)
- rc = migrate_page(mapping, newpage, page);
+ rc = migrate_page(mapping, newpage, page, mode);
else if (mapping->a_ops->migratepage)
/*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
+ * Most pages have a mapping and most filesystems provide a
+ * migratepage callback. Anonymous pages are part of swap
+ * space which also has its own migratepage callback. This
+ * is the most common path for page migration.
*/
rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
+ newpage, page, mode);
else
- rc = fallback_migrate_page(mapping, newpage, page);
+ rc = fallback_migrate_page(mapping, newpage, page, mode);
- if (!rc)
- remove_migration_ptes(page, newpage);
- else
+ if (rc != MIGRATEPAGE_SUCCESS) {
newpage->mapping = NULL;
+ } else {
+ if (remap_swapcache)
+ remove_migration_ptes(page, newpage);
+ page->mapping = NULL;
+ }
unlock_page(newpage);
return rc;
}
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, enum migrate_mode mode)
+{
+ int rc = -EAGAIN;
+ int remap_swapcache = 1;
+ struct mem_cgroup *mem;
+ struct anon_vma *anon_vma = NULL;
+
+ if (!trylock_page(page)) {
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readpages). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto out;
+
+ lock_page(page);
+ }
+
+ /* charge against new page */
+ mem_cgroup_prepare_migration(page, newpage, &mem);
+
+ if (PageWriteback(page)) {
+ /*
+ * Only in the case of a full synchronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
+ */
+ if (mode != MIGRATE_SYNC) {
+ rc = -EBUSY;
+ goto uncharge;
+ }
+ if (!force)
+ goto uncharge;
+ wait_on_page_writeback(page);
+ }
+ /*
+ * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrate a page.
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
+ * of migration. File cache pages are no problem because of page_lock()
+ * File Caches may use write_page() or lock_page() in migration, then,
+ * just care Anon page here.
+ */
+ if (PageAnon(page) && !PageKsm(page)) {
+ /*
+ * Only page_lock_anon_vma_read() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_get_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Anon page
+ */
+ } else if (PageSwapCache(page)) {
+ /*
+ * We cannot be sure that the anon_vma of an unmapped
+ * swapcache page is safe to use because we don't
+ * know in advance if the VMA that this page belonged
+ * to still exists. If the VMA and others sharing the
+ * data have been freed, then the anon_vma could
+ * already be invalid.
+ *
+ * To avoid this possibility, swapcache pages get
+ * migrated but are not remapped when migration
+ * completes
+ */
+ remap_swapcache = 0;
+ } else {
+ goto uncharge;
+ }
+ }
+
+ if (unlikely(balloon_page_movable(page))) {
+ /*
+ * A ballooned page does not need any special attention from
+ * physical to virtual reverse mapping procedures.
+ * Skip any attempt to unmap PTEs or to remap swap cache,
+ * in order to avoid burning cycles at rmap level, and perform
+ * the page migration right away (protected by page lock).
+ */
+ rc = balloon_page_migrate(newpage, page, mode);
+ goto uncharge;
+ }
+
+ /*
+ * Corner case handling:
+ * 1. When a new swap-cache page is read into, it is added to the LRU
+ * and treated as swapcache but it has no rmap yet.
+ * Calling try_to_unmap() against a page->mapping==NULL page will
+ * trigger a BUG. So handle it here.
+ * 2. An orphaned page (see truncate_complete_page) might have
+ * fs-private metadata. The page can be picked up due to memory
+ * offlining. Everywhere else except page reclaim, the page is
+ * invisible to the vm, so the page can not be migrated. So try to
+ * free the metadata, so the page can be freed.
+ */
+ if (!page->mapping) {
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ if (page_has_private(page)) {
+ try_to_free_buffers(page);
+ goto uncharge;
+ }
+ goto skip_unmap;
+ }
+
+ /* Establish migration ptes or remove ptes */
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+skip_unmap:
+ if (!page_mapped(page))
+ rc = move_to_new_page(newpage, page, remap_swapcache, mode);
+
+ if (rc && remap_swapcache)
+ remove_migration_ptes(page, page);
+
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+
+uncharge:
+ mem_cgroup_end_migration(mem, page, newpage,
+ (rc == MIGRATEPAGE_SUCCESS ||
+ rc == MIGRATEPAGE_BALLOON_SUCCESS));
+ unlock_page(page);
+out:
+ return rc;
+}
+
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -596,81 +947,183 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
if (!newpage)
return -ENOMEM;
- if (page_count(page) == 1)
+ if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
- goto move_newpage;
+ goto out;
+ }
- rc = -EAGAIN;
- if (TestSetPageLocked(page)) {
- if (!force)
- goto move_newpage;
- lock_page(page);
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, mode);
+
+ if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+ /*
+ * A ballooned page has been migrated already.
+ * Now, it's the time to wrap-up counters,
+ * hand the page back to Buddy and return.
+ */
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ balloon_page_free(page);
+ return MIGRATEPAGE_SUCCESS;
+ }
+out:
+ if (rc != -EAGAIN) {
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kept its references and be
+ * restored.
+ */
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
}
- if (PageWriteback(page)) {
- if (!force)
- goto unlock;
- wait_on_page_writeback(page);
+ /*
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, putback_lru_page() will drop the reference grabbed
+ * during isolation.
+ */
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+ ClearPageSwapBacked(newpage);
+ put_new_page(newpage, private);
+ } else
+ putback_lru_page(newpage);
+
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(newpage);
}
+ return rc;
+}
+
+/*
+ * Counterpart of unmap_and_move() for hugepage migration.
+ *
+ * This function doesn't wait for the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ struct page *hpage, int force,
+ enum migrate_mode mode)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage;
+ struct anon_vma *anon_vma = NULL;
/*
- * Establish migration ptes or remove ptes
+ * Movability of hugepages depends on architectures and hugepage size.
+ * This check is necessary because some callers of hugepage migration
+ * like soft offline and memory hotremove don't walk through page
+ * tables or check whether the hugepage is pmd-based or not before
+ * kicking migration.
*/
- try_to_unmap(page, 1);
- if (!page_mapped(page))
- rc = move_to_new_page(newpage, page);
+ if (!hugepage_migration_supported(page_hstate(hpage))) {
+ putback_active_hugepage(hpage);
+ return -ENOSYS;
+ }
- if (rc)
- remove_migration_ptes(page, page);
+ new_hpage = get_new_page(hpage, private, &result);
+ if (!new_hpage)
+ return -ENOMEM;
-unlock:
- unlock_page(page);
+ rc = -EAGAIN;
- if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
- move_to_lru(page);
+ if (!trylock_page(hpage)) {
+ if (!force || mode != MIGRATE_SYNC)
+ goto out;
+ lock_page(hpage);
}
-move_newpage:
+ if (PageAnon(hpage))
+ anon_vma = page_get_anon_vma(hpage);
+
+ try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ remove_migration_ptes(hpage, hpage);
+
+ if (anon_vma)
+ put_anon_vma(anon_vma);
+
+ if (rc == MIGRATEPAGE_SUCCESS)
+ hugetlb_cgroup_migrate(hpage, new_hpage);
+
+ unlock_page(hpage);
+out:
+ if (rc != -EAGAIN)
+ putback_active_hugepage(hpage);
+
/*
- * Move the new page to the LRU. If migration was not successful
- * then this will free the page.
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, put_page() will drop the reference grabbed during
+ * isolation.
*/
- move_to_lru(newpage);
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ put_new_page(new_hpage, private);
+ else
+ put_page(new_hpage);
+
if (result) {
if (rc)
*result = rc;
else
- *result = page_to_nid(newpage);
+ *result = page_to_nid(new_hpage);
}
return rc;
}
/*
- * migrate_pages
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ * supplied as the target for the page migration
*
- * The function takes one list of pages to migrate and a function
- * that determines from the page to be migrated and the private data
- * the target of the move and allocates the page.
+ * @from: The list of pages to be migrated.
+ * @get_new_page: The function used to allocate free pages to be used
+ * as the target of the page migration.
+ * @put_new_page: The function used to free target pages if migration
+ * fails, or NULL if no special handling is necessary.
+ * @private: Private data to be passed on to get_new_page()
+ * @mode: The migration mode that specifies the constraints for
+ * page migration, if any.
+ * @reason: The reason for page migration.
*
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore. All pages will be
- * retruned to the LRU or freed.
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_lru_pages() to return pages to the LRU
+ * or free list only if ret != 0.
*
- * Return: Number of pages not migrated or error code.
+ * Returns the number of pages that were not migrated, or an error code.
*/
-int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private)
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
+ int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -686,8 +1139,13 @@ int migrate_pages(struct list_head *from,
list_for_each_entry_safe(page, page2, from, lru) {
cond_resched();
- rc = unmap_and_move(get_new_page, private,
- page, pass > 2);
+ if (PageHuge(page))
+ rc = unmap_and_move_huge_page(get_new_page,
+ put_new_page, private, page,
+ pass > 2, mode);
+ else
+ rc = unmap_and_move(get_new_page, put_new_page,
+ private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
@@ -695,26 +1153,33 @@ int migrate_pages(struct list_head *from,
case -EAGAIN:
retry++;
break;
- case 0:
+ case MIGRATEPAGE_SUCCESS:
+ nr_succeeded++;
break;
default:
- /* Permanent failure */
+ /*
+ * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * unlike -EAGAIN case, the failed page is
+ * removed from migration page list and not
+ * retried in the next outer loop.
+ */
nr_failed++;
break;
}
}
}
- rc = 0;
+ rc = nr_failed + retry;
out:
+ if (nr_succeeded)
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ if (nr_failed)
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- putback_lru_pages(from);
-
- if (rc)
- return rc;
-
- return nr_failed + retry;
+ return rc;
}
#ifdef CONFIG_NUMA
@@ -741,16 +1206,23 @@ static struct page *new_page_node(struct page *p, unsigned long private,
*result = &pm->status;
- return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ pm->node);
+ else
+ return alloc_pages_exact_node(pm->node,
+ GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
/*
* Move a set of pages as indicated in the pm array. The addr
* field must be set to the virtual address of the page to be moved
* and the node number must contain a valid target node.
+ * The pm array ends with node = MAX_NUMNODES.
*/
-static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
- int migrate_all)
+static int do_move_page_to_node_array(struct mm_struct *mm,
+ struct page_to_node *pm,
+ int migrate_all)
{
int err;
struct page_to_node *pp;
@@ -761,28 +1233,27 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
/*
* Build a list of pages to migrate
*/
- migrate_prep();
for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
struct vm_area_struct *vma;
struct page *page;
- /*
- * A valid page pointer that will not match any of the
- * pages that will be moved.
- */
- pp->page = ZERO_PAGE(0);
-
err = -EFAULT;
vma = find_vma(mm, pp->addr);
- if (!vma)
+ if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
+ goto set_status;
+
+ page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET);
err = -ENOENT;
if (!page)
goto set_status;
- if (PageReserved(page)) /* Check for zero page */
+ /* Use PageReserved to check for zero page */
+ if (PageReserved(page))
goto put_and_set;
pp->page = page;
@@ -799,7 +1270,17 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
!migrate_all)
goto put_and_set;
- err = isolate_lru_page(page, &pagelist);
+ if (PageHuge(page)) {
+ isolate_huge_page(page, &pagelist);
+ goto put_and_set;
+ }
+
+ err = isolate_lru_page(page);
+ if (!err) {
+ list_add_tail(&page->lru, &pagelist);
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ }
put_and_set:
/*
* Either remove the duplicate refcount from
@@ -811,36 +1292,131 @@ set_status:
pp->status = err;
}
- if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm);
- else
- err = -ENOENT;
+ err = 0;
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages(&pagelist, new_page_node, NULL,
+ (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
+ if (err)
+ putback_movable_pages(&pagelist);
+ }
up_read(&mm->mmap_sem);
return err;
}
/*
- * Determine the nodes of a list of pages. The addr in the pm array
- * must have been set to the virtual address of which we want to determine
- * the node number.
+ * Migrate an array of page addresses onto an array of nodes and fill
+ * the corresponding array of status.
*/
-static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
+ unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
{
+ struct page_to_node *pm;
+ unsigned long chunk_nr_pages;
+ unsigned long chunk_start;
+ int err;
+
+ err = -ENOMEM;
+ pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
+ if (!pm)
+ goto out;
+
+ migrate_prep();
+
+ /*
+ * Store a chunk of page_to_node array in a page,
+ * but keep the last one as a marker
+ */
+ chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
+
+ for (chunk_start = 0;
+ chunk_start < nr_pages;
+ chunk_start += chunk_nr_pages) {
+ int j;
+
+ if (chunk_start + chunk_nr_pages > nr_pages)
+ chunk_nr_pages = nr_pages - chunk_start;
+
+ /* fill the chunk pm with addrs and nodes from user-space */
+ for (j = 0; j < chunk_nr_pages; j++) {
+ const void __user *p;
+ int node;
+
+ err = -EFAULT;
+ if (get_user(p, pages + j + chunk_start))
+ goto out_pm;
+ pm[j].addr = (unsigned long) p;
+
+ if (get_user(node, nodes + j + chunk_start))
+ goto out_pm;
+
+ err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_pm;
+
+ if (!node_state(node, N_MEMORY))
+ goto out_pm;
+
+ err = -EACCES;
+ if (!node_isset(node, task_nodes))
+ goto out_pm;
+
+ pm[j].node = node;
+ }
+
+ /* End marker for this chunk */
+ pm[chunk_nr_pages].node = MAX_NUMNODES;
+
+ /* Migrate this chunk */
+ err = do_move_page_to_node_array(mm, pm,
+ flags & MPOL_MF_MOVE_ALL);
+ if (err < 0)
+ goto out_pm;
+
+ /* Return status information */
+ for (j = 0; j < chunk_nr_pages; j++)
+ if (put_user(pm[j].status, status + j + chunk_start)) {
+ err = -EFAULT;
+ goto out_pm;
+ }
+ }
+ err = 0;
+
+out_pm:
+ free_page((unsigned long)pm);
+out:
+ return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ */
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
+ const void __user **pages, int *status)
+{
+ unsigned long i;
+
down_read(&mm->mmap_sem);
- for ( ; pm->node != MAX_NUMNODES; pm++) {
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long addr = (unsigned long)(*pages);
struct vm_area_struct *vma;
struct page *page;
- int err;
+ int err = -EFAULT;
- err = -EFAULT;
- vma = find_vma(mm, pm->addr);
- if (!vma)
+ vma = find_vma(mm, addr);
+ if (!vma || addr < vma->vm_start)
+ goto set_status;
+
+ page = follow_page(vma, addr, 0);
+
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
goto set_status;
- page = follow_page(vma, pm->addr, 0);
err = -ENOENT;
/* Use PageReserved to check for zero page */
if (!page || PageReserved(page))
@@ -848,28 +1424,63 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
err = page_to_nid(page);
set_status:
- pm->status = err;
+ *status = err;
+
+ pages++;
+ status++;
}
up_read(&mm->mmap_sem);
- return 0;
+}
+
+/*
+ * Determine the nodes of a user array of pages and store it in
+ * a user array of status.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+ const void __user * __user *pages,
+ int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16
+ const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+ int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+
+ while (nr_pages) {
+ unsigned long chunk_nr;
+
+ chunk_nr = nr_pages;
+ if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
+ chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+
+ if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
+ break;
+
+ do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+
+ if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+ break;
+
+ pages += chunk_nr;
+ status += chunk_nr;
+ nr_pages -= chunk_nr;
+ }
+ return nr_pages ? -EFAULT : 0;
}
/*
* Move a list of pages in the address space of the currently executing
* process.
*/
-asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
- const void __user * __user *pages,
- const int __user *nodes,
- int __user *status, int flags)
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+ const void __user * __user *, pages,
+ const int __user *, nodes,
+ int __user *, status, int, flags)
{
- int err = 0;
- int i;
+ const struct cred *cred = current_cred(), *tcred;
struct task_struct *task;
- nodemask_t task_nodes;
struct mm_struct *mm;
- struct page_to_node *pm = NULL;
+ int err;
+ nodemask_t task_nodes;
/* Check flags */
if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -879,17 +1490,13 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
return -EPERM;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
- task = pid ? find_task_by_pid(pid) : current;
+ rcu_read_lock();
+ task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return -ESRCH;
}
- mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
-
- if (!mm)
- return -EINVAL;
+ get_task_struct(task);
/*
* Check if this process has the right to modify the specified
@@ -897,82 +1504,40 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- if ((current->euid != task->suid) && (current->euid != task->uid) &&
- (current->uid != task->suid) && (current->uid != task->uid) &&
+ tcred = __task_cred(task);
+ if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
!capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
- goto out2;
+ goto out;
}
+ rcu_read_unlock();
err = security_task_movememory(task);
if (err)
- goto out2;
-
+ goto out;
task_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+ put_task_struct(task);
- /* Limit nr_pages so that the multiplication may not overflow */
- if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
- err = -E2BIG;
- goto out2;
- }
-
- pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
- if (!pm) {
- err = -ENOMEM;
- goto out2;
- }
-
- /*
- * Get parameters from user space and initialize the pm
- * array. Return various errors if the user did something wrong.
- */
- for (i = 0; i < nr_pages; i++) {
- const void *p;
-
- err = -EFAULT;
- if (get_user(p, pages + i))
- goto out;
-
- pm[i].addr = (unsigned long)p;
- if (nodes) {
- int node;
-
- if (get_user(node, nodes + i))
- goto out;
-
- err = -ENODEV;
- if (!node_online(node))
- goto out;
-
- err = -EACCES;
- if (!node_isset(node, task_nodes))
- goto out;
-
- pm[i].node = node;
- }
- }
- /* End marker */
- pm[nr_pages].node = MAX_NUMNODES;
+ if (!mm)
+ return -EINVAL;
if (nodes)
- err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
+ err = do_pages_move(mm, task_nodes, nr_pages, pages,
+ nodes, status, flags);
else
- err = do_pages_stat(mm, pm);
+ err = do_pages_stat(mm, nr_pages, pages, status);
- if (err >= 0)
- /* Return status information */
- for (i = 0; i < nr_pages; i++)
- if (put_user(pm[i].status, status + i))
- err = -EFAULT;
+ mmput(mm);
+ return err;
out:
- vfree(pm);
-out2:
- mmput(mm);
+ put_task_struct(task);
return err;
}
-#endif
/*
* Call migration functions in the vma_ops that may prepare
@@ -985,7 +1550,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
struct vm_area_struct *vma;
int err = 0;
- for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+ for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
if (vma->vm_ops && vma->vm_ops->migrate) {
err = vma->vm_ops->migrate(vma, to, from, flags);
if (err)
@@ -994,3 +1559,379 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which is crude
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ unsigned long nr_migrate_pages)
+{
+ int z;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (!zone_reclaimable(zone))
+ continue;
+
+ /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ 0, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data,
+ int **result)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_exact_node(nid,
+ (GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE | __GFP_NOMEMALLOC |
+ __GFP_NORETRY | __GFP_NOWARN) &
+ ~GFP_IOFS, 0);
+
+ return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
+ * as it is faults that reset the window, pte updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+ unsigned long nr_pages)
+{
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ spin_lock(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies +
+ msecs_to_jiffies(migrate_interval_millisecs);
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+ }
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+ trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+ nr_pages);
+ return true;
+ }
+
+ /*
+ * This is an unlocked non-atomic update so errors are possible.
+ * The consequences are failing to migrate when we potentially should
+ * have which is not severe enough to warrant locking. If it is ever
+ * a problem, it can be converted to a per-cpu counter.
+ */
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ return false;
+}
+
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int page_lru;
+
+ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+
+ /* Avoid migrating to a node that is nearly full */
+ if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ return 0;
+
+ if (isolate_lru_page(page))
+ return 0;
+
+ /*
+ * migrate_misplaced_transhuge_page() skips page migration's usual
+ * check on page_count(), so we must do it here, now that the page
+ * has been isolated: a GUP pin, or any other pin, prevents migration.
+ * The expected page count is 3: 1 for page's mapcount and 1 for the
+ * caller's pin and 1 for the reference taken by isolate_lru_page().
+ */
+ if (PageTransHuge(page) && page_count(page) != 3) {
+ putback_lru_page(page);
+ return 0;
+ }
+
+ page_lru = page_is_file_cache(page);
+ mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+ hpage_nr_pages(page));
+
+ /*
+ * Isolating the page has taken another reference, so the
+ * caller's reference can be safely dropped without the page
+ * disappearing underneath us during migration.
+ */
+ put_page(page);
+ return 1;
+}
+
+bool pmd_trans_migrating(pmd_t pmd)
+{
+ struct page *page = pmd_page(pmd);
+ return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+ wait_on_page_locked(page);
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+ int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate file pages that are mapped in multiple processes
+ * with execute permissions as they are probably shared libraries.
+ */
+ if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+ (vma->vm_flags & VM_EXEC))
+ goto out;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, 1))
+ goto out;
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ if (!list_empty(&migratepages)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+ return isolated;
+
+out:
+ put_page(page);
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ *
+ * Returns the non-zero result of numamigrate_isolate_page() when the huge
+ * page was moved to @node, and 0 when the migration was not performed.
+ * Every path that returns 0 ends with unlock_page() and put_page() on @page;
+ * the paths reaching out_dropref additionally rewrite the pmd with
+ * pmd_mknonnuma() if it still equals @entry.
+ */
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ spinlock_t *ptl;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ pmd_t orig_entry;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+ goto out_dropref;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+ HPAGE_PMD_ORDER);
+ if (!new_page)
+ goto out_fail;
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ put_page(new_page);
+ goto out_fail;
+ }
+
+ if (mm_tlb_flush_pending(mm))
+ flush_tlb_range(vma, mmun_start, mmun_end);
+
+ /* Prepare a page as a migration target */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ /* Retake the caller's reference and put the page back on the LRU */
+ get_page(page);
+ putback_lru_page(page);
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+
+ goto out_unlock;
+ }
+
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ orig_entry = *pmd;
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+ /*
+ * Clear the old entry under pagetable lock and establish the new PMD
+ * entry. Any parallel GUP will either observe the old page blocking
+ * on the page lock, block on the page table lock or observe the new
+ * page. The SetPageUptodate on the new page and page_add_new_anon_rmap
+ * guarantee the copy is visible before the pagetable update.
+ * NOTE(review): the call below is page_add_anon_rmap(), not
+ * page_add_new_anon_rmap() - confirm the statement above still holds.
+ */
+ flush_cache_range(vma, mmun_start, mmun_end);
+ page_add_anon_rmap(new_page, vma, mmun_start);
+ pmdp_clear_flush(vma, mmun_start, pmd);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry);
+
+ if (page_count(page) != 2) {
+ set_pmd_at(mm, mmun_start, pmd, orig_entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry); /* NOTE(review): pmd now holds orig_entry, yet &entry is passed - confirm */
+ page_remove_rmap(new_page);
+ goto fail_putback;
+ }
+
+ page_remove_rmap(page);
+
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ /* Take an "isolate" reference and put new page on the LRU. */
+ get_page(new_page);
+ putback_lru_page(new_page);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_fail:
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+out_dropref:
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_same(*pmd, entry)) {
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ }
+ spin_unlock(ptl);
+
+out_unlock:
+ unlock_page(page);
+ put_page(page);
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/mm/mincore.c b/mm/mincore.c
index 72890780c1c..725c8096104 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -1,35 +1,91 @@
/*
* linux/mm/mincore.c
*
- * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 1994-2006 Linus Torvalds
*/
/*
* The mincore() system call.
*/
-#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+/*
+ * Fill @vec with one byte per PAGE_SIZE step in [@addr, @end) of a hugetlb
+ * vma. The huge pte is looked up once per huge page and its present state is
+ * repeated for every small-page slot that huge page covers. Without
+ * CONFIG_HUGETLB_PAGE the body is just BUG().
+ */
+static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ struct hstate *h;
+
+ h = hstate_vma(vma);
+ while (1) {
+ unsigned char present;
+ pte_t *ptep;
+ /*
+ * Huge pages are always in RAM for now, but
+ * theoretically it needs to be checked.
+ */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = ptep && !huge_pte_none(huge_ptep_get(ptep));
+ while (1) {
+ *vec = present;
+ vec++;
+ addr += PAGE_SIZE;
+ if (addr == end)
+ return;
+ /* check hugepage border */
+ if (!(addr & ~huge_page_mask(h)))
+ break;
+ }
+ }
+#else
+ BUG();
+#endif
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct vm_area_struct * vma,
- unsigned long pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
{
unsigned char present = 0;
- struct address_space * as = vma->vm_file->f_mapping;
- struct page * page;
+ struct page *page;
- page = find_get_page(as, pgoff);
+ /*
+ * When tmpfs swaps out a page from a file, any process mapping that
+ * file will not get a swp_entry_t in its pte, but rather it is like
+ * any other file mapping (ie. marked !present and faulted in with
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ */
+#ifdef CONFIG_SWAP
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
+#endif
if (page) {
present = PageUptodate(page);
page_cache_release(page);
@@ -38,46 +94,151 @@ static unsigned char mincore_page(struct vm_area_struct * vma,
return present;
}
-static long mincore_vma(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, unsigned char __user * vec)
+static void mincore_unmapped_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
{
- long error, i, remaining;
- unsigned char * tmp;
+ unsigned long nr = (end - addr) >> PAGE_SHIFT;
+ int i;
- error = -ENOMEM;
- if (!vma->vm_file)
- return error;
+ if (vma->vm_file) {
+ pgoff_t pgoff;
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+}
- error = -EAGAIN;
- tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
- if (!tmp)
- return error;
+static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ spinlock_t *ptl;
+ pte_t *ptep;
- /* (end - start) is # of pages, and also # of bytes in "vec */
- remaining = (end - start),
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ do {
+ pte_t pte = *ptep;
+ pgoff_t pgoff;
- error = 0;
- for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
- int j = 0;
- long thispiece = (remaining < PAGE_SIZE) ?
- remaining : PAGE_SIZE;
+ next = addr + PAGE_SIZE;
+ if (pte_none(pte))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else if (pte_present(pte))
+ *vec = 1;
+ else if (pte_file(pte)) {
+ pgoff = pte_to_pgoff(pte);
+ *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else { /* pte is a swap entry */
+ swp_entry_t entry = pte_to_swp_entry(pte);
- while (j < thispiece)
- tmp[j++] = mincore_page(vma, start++);
+ if (is_migration_entry(entry)) {
+ /* migration entries are always uptodate */
+ *vec = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ pgoff = entry.val;
+ *vec = mincore_page(swap_address_space(entry),
+ pgoff);
+#else
+ WARN_ON(1);
+ *vec = 1;
+#endif
+ }
+ }
+ vec++;
+ } while (ptep++, addr = next, addr != end);
+ pte_unmap_unlock(ptep - 1, ptl);
+}
- if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
- error = -EFAULT;
- break;
+static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+ vec += (next - addr) >> PAGE_SHIFT;
+ continue;
+ }
+ /* fall through */
}
- }
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pte_range(vma, pmd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pmd++, addr = next, addr != end);
+}
- free_page((unsigned long) tmp);
- return error;
+/*
+ * Walk the pud entries under @pgd that cover [@addr, @end). A pud for which
+ * pud_none_or_clear_bad() is true is reported via mincore_unmapped_range();
+ * any other pud is descended into with mincore_pmd_range(). @vec advances by
+ * one byte per page handled.
+ */
+static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pmd_range(vma, pud, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pud++, addr = next, addr != end);
+}
+
+/*
+ * Top of the page table walk for mincore: iterate over the pgd entries of
+ * vma->vm_mm that cover [@addr, @end). A pgd for which
+ * pgd_none_or_clear_bad() is true is reported via mincore_unmapped_range();
+ * any other pgd is descended into with mincore_pud_range(). @vec advances by
+ * one byte per page handled.
+ */
+static void mincore_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pgd_t *pgd;
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pud_range(vma, pgd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pgd++, addr = next, addr != end);
+}
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ *
+ * Only the single vma containing @addr is examined, so the range is clipped
+ * at that vma's end. Returns the number of pages (== bytes written to @vec)
+ * actually covered, or -ENOMEM when no vma contains @addr.
+ */
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+{
+ struct vm_area_struct *vma;
+ unsigned long end;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma || addr < vma->vm_start)
+ return -ENOMEM;
+
+ end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+
+ if (is_vm_hugetlb_page(vma))
+ mincore_hugetlb_page_range(vma, addr, end, vec);
+ else
+ mincore_page_range(vma, addr, end, vec);
+
+ return (end - addr) >> PAGE_SHIFT;
}
/*
@@ -104,85 +265,53 @@ static long mincore_vma(struct vm_area_struct * vma,
* mapped
* -EAGAIN - A kernel resource was temporarily unavailable.
*/
-asmlinkage long sys_mincore(unsigned long start, size_t len,
- unsigned char __user * vec)
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+ unsigned char __user *, vec)
{
- int index = 0;
- unsigned long end, limit;
- struct vm_area_struct * vma;
- size_t max;
- int unmapped_error = 0;
- long error;
-
- /* check the arguments */
- if (start & ~PAGE_CACHE_MASK)
- goto einval;
-
- limit = TASK_SIZE;
- if (start >= limit)
- goto enomem;
+ long retval;
+ unsigned long pages;
+ unsigned char *tmp;
- if (!len)
- return 0;
+ /* Check the start address: needs to be page-aligned.. */
+ if (start & ~PAGE_CACHE_MASK)
+ return -EINVAL;
- max = limit - start;
- len = PAGE_CACHE_ALIGN(len);
- if (len > max || !len)
- goto enomem;
+ /* ..and we need to be passed a valid user-space range */
+ if (!access_ok(VERIFY_READ, (void __user *) start, len))
+ return -ENOMEM;
- end = start + len;
+ /* This also avoids any overflows on PAGE_CACHE_ALIGN */
+ pages = len >> PAGE_SHIFT;
+ pages += (len & ~PAGE_MASK) != 0;
- /* check the output buffer whilst holding the lock */
- error = -EFAULT;
- down_read(&current->mm->mmap_sem);
+ if (!access_ok(VERIFY_WRITE, vec, pages))
+ return -EFAULT;
- if (!access_ok(VERIFY_WRITE, vec, len >> PAGE_SHIFT))
- goto out;
+ tmp = (void *) __get_free_page(GFP_USER);
+ if (!tmp)
+ return -EAGAIN;
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- */
- error = 0;
-
- vma = find_vma(current->mm, start);
- while (vma) {
- /* Here start < vma->vm_end. */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- }
+ retval = 0;
+ while (pages) {
+ /*
+ * Do at most PAGE_SIZE entries per iteration, due to
+ * the temporary buffer size.
+ */
+ down_read(&current->mm->mmap_sem);
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+ up_read(&current->mm->mmap_sem);
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = mincore_vma(vma, start, end,
- &vec[index]);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
+ if (retval <= 0)
+ break;
+ if (copy_to_user(vec, tmp, retval)) {
+ retval = -EFAULT;
+ break;
}
-
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
- if (error)
- goto out;
- index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
- start = vma->vm_end;
- vma = vma->vm_next;
+ pages -= retval;
+ vec += retval;
+ start += retval << PAGE_SHIFT;
+ retval = 0;
}
-
- /* we found a hole in the area queried if we arrive here */
- error = -ENOMEM;
-
-out:
- up_read(&current->mm->mmap_sem);
- return error;
-
-einval:
- return -EINVAL;
-enomem:
- return -ENOMEM;
+ free_page((unsigned long) tmp);
+ return retval;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index b90c59573ab..b1eb5363400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,22 +8,554 @@
#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include "internal.h"
+/*
+ * Return 1 when the caller may mlock memory: either it is capable of
+ * CAP_IPC_LOCK or its RLIMIT_MEMLOCK limit is non-zero. Return 0 otherwise.
+ */
+int can_do_mlock(void)
+{
+ if (capable(CAP_IPC_LOCK))
+ return 1;
+ if (rlimit(RLIMIT_MEMLOCK) != 0)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(can_do_mlock);
+
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ * Clear PageMlocked on @page and fix up the accounting.
+ *
+ * Nothing is done if the flag was not set. Otherwise NR_MLOCK is lowered by
+ * hpage_nr_pages(page), UNEVICTABLE_PGCLEARED is counted, and the page is
+ * isolated from the LRU and handed to putback_lru_page() when isolation
+ * succeeds.
+ */
+void clear_page_mlock(struct page *page)
+{
+ if (!TestClearPageMlocked(page))
+ return;
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGCLEARED);
+ if (!isolate_lru_page(page)) {
+ putback_lru_page(page);
+ } else {
+ /*
+ * We lost the race: the page has already been moved to the
+ * evictable list.
+ */
+ if (PageUnevictable(page))
+ count_vm_event(UNEVICTABLE_PGSTRANDED);
+ }
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ *
+ * The page must be locked (BUG_ON otherwise). NR_MLOCK is raised by
+ * hpage_nr_pages(page) and UNEVICTABLE_PGMLOCKED is counted only when the
+ * Mlocked flag was not already set.
+ */
+void mlock_vma_page(struct page *page)
+{
+ /* Serialize with page migration */
+ BUG_ON(!PageLocked(page));
+
+ if (!TestSetPageMlocked(page)) {
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ if (!isolate_lru_page(page))
+ putback_lru_page(page);
+ }
+}
+
+/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ *
+ * Returns true if PageLRU was set and the page has been taken off its LRU
+ * list (with an extra reference when @getpage is true); returns false and
+ * changes nothing if PageLRU was not set.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+ if (getpage)
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting:
+ * UNEVICTABLE_PGMUNLOCKED is counted unless try_to_munlock() returned
+ * SWAP_MLOCK.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+ int ret = SWAP_AGAIN;
+
+ /*
+ * Optimization: if the page was mapped just once, that's our mapping
+ * and we don't need to check all the other vmas.
+ */
+ if (page_mapcount(page) > 1)
+ ret = try_to_munlock(page);
+
+ /* Did try_to_munlock() succeed or punt? */
+ if (ret != SWAP_MLOCK)
+ count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+ putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation fail during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ *
+ * Counts UNEVICTABLE_PGSTRANDED if the page still has PageUnevictable set,
+ * UNEVICTABLE_PGMUNLOCKED otherwise.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+ if (PageUnevictable(page))
+ __count_vm_event(UNEVICTABLE_PGSTRANDED);
+ else
+ __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
+/**
+ * munlock_vma_page - munlock a vma page
+ * @page - page to be munlocked, either a normal page or THP page head;
+ * it must be locked (BUG_ON otherwise)
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
+ *
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ * When we munlock a page, because the vma where we found the page is being
+ * munlock()ed or munmap()ed, we want to check whether other vmas hold the
+ * page locked so that we can leave it on the unevictable lru list and not
+ * bother vmscan with it. However, to walk the page's rmap list in
+ * try_to_munlock() we must isolate the page from the LRU. If some other
+ * task has removed the page from the LRU, we won't be able to do that.
+ * So we clear the PageMlocked as we might not get another chance. If we
+ * can't isolate the page, we leave it for putback_lru_page() and vmscan
+ * [page_referenced()/try_to_unmap()] to deal with.
+ */
+unsigned int munlock_vma_page(struct page *page)
+{
+ unsigned int nr_pages;
+ struct zone *zone = page_zone(page);
+
+ /* For try_to_munlock() and to serialize with page migration */
+ BUG_ON(!PageLocked(page));
+
+ /*
+ * Serialize with any parallel __split_huge_page_refcount() which
+ * might otherwise copy PageMlocked to part of the tail pages before
+ * we clear it in the head page. It also stabilizes hpage_nr_pages().
+ */
+ spin_lock_irq(&zone->lru_lock);
+
+ nr_pages = hpage_nr_pages(page);
+ if (!TestClearPageMlocked(page))
+ goto unlock_out;
+
+ __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+ if (__munlock_isolate_lru_page(page, true)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __munlock_isolated_page(page);
+ goto out;
+ }
+ __munlock_isolation_failed(page);
+
+unlock_out:
+ spin_unlock_irq(&zone->lru_lock);
+
+out:
+ return nr_pages - 1;
+}
+
+/**
+ * __mlock_vma_pages_range() - mlock a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking: passed through unchanged to __get_user_pages()
+ *
+ * This takes care of making the pages present too.
+ *
+ * return 0 on success, negative error code on error.
+ * NOTE(review): the value returned is whatever __get_user_pages() returns;
+ * confirm that matches the "0 on success" statement above.
+ *
+ * vma->vm_mm->mmap_sem must be held for at least read.
+ */
+long __mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON(start < vma->vm_start);
+ VM_BUG_ON(end > vma->vm_end);
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ gup_flags = FOLL_TOUCH | FOLL_MLOCK;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * convert get_user_pages() return value to posix mlock() error:
+ * -EFAULT becomes -ENOMEM, -ENOMEM becomes -EAGAIN, and every other value
+ * is returned unchanged.
+ */
+static int __mlock_posix_error_return(long retval)
+{
+ if (retval == -EFAULT)
+ retval = -ENOMEM;
+ else if (retval == -ENOMEM)
+ retval = -EAGAIN;
+ return retval;
+}
+
+/*
+ * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
+ *
+ * The fast path is available only for evictable pages with single mapping.
+ * Then we can bypass the per-cpu pvec and get better performance.
+ * When mapcount > 1 we need try_to_munlock() which can fail.
+ * When !page_evictable(), we need the full redo logic of putback_lru_page to
+ * avoid leaving evictable page in unevictable list.
+ *
+ * In case of success, @page is added to @pvec and @pgrescued is incremented
+ * in case that the page was previously unevictable. @page is also unlocked.
+ *
+ * Returns true when the page was queued on @pvec, false when the page does
+ * not qualify; in the false case the page is left locked and untouched.
+ */
+static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
+ int *pgrescued)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (page_mapcount(page) <= 1 && page_evictable(page)) {
+ pagevec_add(pvec, page);
+ if (TestClearPageUnevictable(page))
+ (*pgrescued)++;
+ unlock_page(page);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Putback multiple evictable pages to the LRU
+ *
+ * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
+ * the pages might have meanwhile become unevictable but that is OK.
+ *
+ * UNEVICTABLE_PGMUNLOCKED is raised by the number of pages in @pvec and
+ * UNEVICTABLE_PGRESCUED by @pgrescued.
+ */
+static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+{
+ count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
+ /*
+ * __pagevec_lru_add() calls release_pages() so we don't call
+ * put_page() explicitly
+ */
+ __pagevec_lru_add(pvec);
+ count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+}
+
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split into three phases. The first phase clears the Mlocked
+ * flag and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded. The third phase puts back, in one batch, the pages that
+ * qualified for the fast path in the second phase.
+ *
+ * NR_MLOCK is lowered by the number of pages that were isolated in the
+ * first phase (nr minus the pages moved to the local putback pagevec).
+ *
+ * Note that the pagevec may be modified during the process.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+ int i;
+ int nr = pagevec_count(pvec);
+ int delta_munlocked;
+ struct pagevec pvec_putback;
+ int pgrescued = 0;
+
+ pagevec_init(&pvec_putback, 0);
+
+ /* Phase 1: page isolation */
+ spin_lock_irq(&zone->lru_lock);
+ for (i = 0; i < nr; i++) {
+ struct page *page = pvec->pages[i];
+
+ if (TestClearPageMlocked(page)) {
+ /*
+ * We already have pin from follow_page_mask()
+ * so we can spare the get_page() here.
+ */
+ if (__munlock_isolate_lru_page(page, false))
+ continue;
+ else
+ __munlock_isolation_failed(page);
+ }
+
+ /*
+ * We won't be munlocking this page in the next phase
+ * but we still need to release the follow_page_mask()
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release() would deadlock.
+ */
+ pagevec_add(&pvec_putback, pvec->pages[i]);
+ pvec->pages[i] = NULL;
+ }
+ delta_munlocked = -nr + pagevec_count(&pvec_putback);
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ spin_unlock_irq(&zone->lru_lock);
+
+ /* Now we can release pins of pages that we are not munlocking */
+ pagevec_release(&pvec_putback);
+
+ /* Phase 2: page munlock */
+ for (i = 0; i < nr; i++) {
+ struct page *page = pvec->pages[i];
+
+ if (page) {
+ lock_page(page);
+ if (!__putback_lru_fast_prepare(page, &pvec_putback,
+ &pgrescued)) {
+ /*
+ * Slow path. We don't want to lose the last
+ * pin before unlock_page()
+ */
+ get_page(page); /* for putback_lru_page() */
+ __munlock_isolated_page(page);
+ unlock_page(page);
+ put_page(page); /* from follow_page_mask() */
+ }
+ }
+ }
+
+ /*
+ * Phase 3: page putback for pages that qualified for the fast path
+ * This will also call put_page() to return pin from follow_page_mask()
+ */
+ if (pagevec_count(&pvec_putback))
+ __putback_lru_fast(&pvec_putback, pgrescued);
+}
+
+/*
+ * Fill up pagevec for __munlock_pagevec using pte walk
+ *
+ * The function expects that the struct page corresponding to @start address is
+ * a non-THP page already pinned and in the @pvec, and that it belongs to the
+ * zone identified by @zoneid.
+ *
+ * The rest of @pvec is filled by subsequent pages within the same pmd and same
+ * zone, as long as the pte's are present and vm_normal_page() succeeds. These
+ * pages also get pinned.
+ *
+ * Returns the address of the next page that should be scanned. This equals
+ * @start + PAGE_SIZE when no page could be added by the pte walk.
+ */
+static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
+ struct vm_area_struct *vma, int zoneid, unsigned long start,
+ unsigned long end)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ /*
+ * Initialize pte walk starting at the already pinned page where we
+ * are sure that there is a pte, as it was pinned under the same
+ * mmap_sem write op.
+ */
+ pte = get_locked_pte(vma->vm_mm, start, &ptl);
+ /* Make sure we do not cross the page table boundary */
+ end = pgd_addr_end(start, end);
+ end = pud_addr_end(start, end);
+ end = pmd_addr_end(start, end);
+
+ /* The page next to the pinned page is the first we will try to get */
+ start += PAGE_SIZE;
+ while (start < end) {
+ struct page *page = NULL;
+ pte++;
+ if (pte_present(*pte))
+ page = vm_normal_page(vma, start, *pte);
+ /*
+ * Break if page could not be obtained or the page's node+zone does not
+ * match
+ */
+ if (!page || page_zone_id(page) != zoneid)
+ break;
+
+ get_page(page);
+ /*
+ * Increase the address that will be returned *before* the
+ * eventual break due to pvec becoming full by adding the page
+ */
+ start += PAGE_SIZE;
+ if (pagevec_add(pvec, page) == 0)
+ break;
+ }
+ pte_unmap_unlock(pte, ptl);
+ return start;
+}
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ * For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru. In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them. This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ vma->vm_flags &= ~VM_LOCKED;
+
+ while (start < end) {
+ struct page *page = NULL;
+ unsigned int page_mask;
+ unsigned long page_increm;
+ struct pagevec pvec;
+ struct zone *zone;
+ int zoneid;
+
+ pagevec_init(&pvec, 0);
+ /*
+ * Although FOLL_DUMP is intended for get_dump_page(),
+ * it just so happens that its special treatment of the
+ * ZERO_PAGE (returning an error instead of doing get_page)
+ * suits munlock very well (and if somehow an abnormal page
+ * has sneaked into the range, we won't oops here: great).
+ */
+ page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
+ &page_mask);
+
+ if (page && !IS_ERR(page)) {
+ if (PageTransHuge(page)) {
+ lock_page(page);
+ /*
+ * Any THP page found by follow_page_mask() may
+ * have gotten split before reaching
+ * munlock_vma_page(), so we need to recompute
+ * the page_mask here.
+ */
+ page_mask = munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page); /* follow_page_mask() */
+ } else {
+ /*
+ * Non-huge pages are handled in batches via
+ * pagevec. The pin from follow_page_mask()
+ * prevents them from collapsing by THP.
+ */
+ pagevec_add(&pvec, page);
+ zone = page_zone(page);
+ zoneid = page_zone_id(page);
+
+ /*
+ * Try to fill the rest of pagevec using fast
+ * pte walk. This will also update start to
+ * the next page to process. Then munlock the
+ * pagevec.
+ */
+ start = __munlock_pagevec_fill(&pvec, vma,
+ zoneid, start, end);
+ __munlock_pagevec(&pvec, zone);
+ goto next;
+ }
+ }
+ /* It's a bug to munlock in the middle of a THP page */
+ VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + page_mask;
+ start += page_increm * PAGE_SIZE;
+next:
+ cond_resched();
+ }
+}
+
+/*
+ * mlock_fixup - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op. However, for some special vmas, we go ahead and
+ * populate the ptes.
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, unsigned int newflags)
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
- struct mm_struct * mm = vma->vm_mm;
+ struct mm_struct *mm = vma->vm_mm;
pgoff_t pgoff;
- int pages;
+ int nr_pages;
int ret = 0;
+ int lock = !!(newflags & VM_LOCKED);
- if (newflags == vma->vm_flags) {
- *prev = vma;
- goto out;
- }
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
+ goto out; /* don't set VM_LOCKED, don't count */
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
@@ -33,8 +565,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto success;
}
- *prev = vma;
-
if (start != vma->vm_start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
@@ -49,26 +579,26 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
success:
/*
- * vm_flags is protected by the mmap_sem held in write mode.
- * It's okay if try_to_unmap_one unmaps a page just after we
- * set VM_LOCKED, make_pages_present below will bring it back.
+ * Keep track of amount of locked VM.
*/
- vma->vm_flags = newflags;
+ nr_pages = (end - start) >> PAGE_SHIFT;
+ if (!lock)
+ nr_pages = -nr_pages;
+ mm->locked_vm += nr_pages;
/*
- * Keep track of amount of locked VM.
+ * vm_flags is protected by the mmap_sem held in write mode.
+ * It's okay if try_to_unmap_one unmaps a page just after we
+ * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- pages = (end - start) >> PAGE_SHIFT;
- if (newflags & VM_LOCKED) {
- pages = -pages;
- if (!(newflags & VM_IO))
- ret = make_pages_present(start, end);
- }
- vma->vm_mm->locked_vm -= pages;
+ if (lock)
+ vma->vm_flags = newflags;
+ else
+ munlock_vma_pages_range(vma, start, end);
+
out:
- if (ret == -ENOMEM)
- ret = -EAGAIN;
+ *prev = vma;
return ret;
}
@@ -78,27 +608,29 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;
- len = PAGE_ALIGN(len);
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma(current->mm, start);
if (!vma || vma->vm_start > start)
return -ENOMEM;
+ prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
for (nstart = start ; ; ) {
- unsigned int newflags;
+ vm_flags_t newflags;
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vma->vm_flags | VM_LOCKED;
- if (!on)
- newflags &= ~VM_LOCKED;
+ newflags = vma->vm_flags & ~VM_LOCKED;
+ if (on)
+ newflags |= VM_LOCKED;
tmp = vma->vm_end;
if (tmp > end)
@@ -121,7 +653,70 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-asmlinkage long sys_mlock(unsigned long start, size_t len)
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ * start and len must be page aligned (checked with VM_BUG_ON below).
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held (it is taken for read here as needed).
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0; /* nonzero while mmap_sem is held for read; also handed to __mlock_vma_pages_range() by pointer */
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) { /* first pass, or the lock is no longer held: (re)take it and look the vma up again */
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue; /* the loop increment then resumes at nend, i.e. past this vma */
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. __mlock_vma_pages_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) { /* best effort: drop the failure and move on */
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ ret = __mlock_posix_error_return(ret);
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE; /* a non-negative ret is used as a page count: resume right after those pages */
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
unsigned long lock_limit;
@@ -130,61 +725,70 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
if (!can_do_mlock())
return -EPERM;
- down_write(&current->mm->mmap_sem);
+ lru_add_drain_all(); /* flush pagevec */
+
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
locked = len >> PAGE_SHIFT;
- locked += current->mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT;
+ down_write(&current->mm->mmap_sem);
+
+ locked += current->mm->locked_vm;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
+
up_write(&current->mm->mmap_sem);
+ if (!error)
+ error = __mm_populate(start, len, 0);
return error;
}
-asmlinkage long sys_munlock(unsigned long start, size_t len)
+SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
+
+ down_write(&current->mm->mmap_sem);
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
+
return ret;
}
static int do_mlockall(int flags)
{
struct vm_area_struct * vma, * prev = NULL;
- unsigned int def_flags = 0;
if (flags & MCL_FUTURE)
- def_flags = VM_LOCKED;
- current->mm->def_flags = def_flags;
+ current->mm->def_flags |= VM_LOCKED;
+ else
+ current->mm->def_flags &= ~VM_LOCKED;
if (flags == MCL_FUTURE)
goto out;
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
- unsigned int newflags;
+ vm_flags_t newflags;
- newflags = vma->vm_flags | VM_LOCKED;
- if (!(flags & MCL_CURRENT))
- newflags &= ~VM_LOCKED;
+ newflags = vma->vm_flags & ~VM_LOCKED;
+ if (flags & MCL_CURRENT)
+ newflags |= VM_LOCKED;
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ cond_resched();
}
out:
return 0;
}
-asmlinkage long sys_mlockall(int flags)
+SYSCALL_DEFINE1(mlockall, int, flags)
{
unsigned long lock_limit;
int ret = -EINVAL;
@@ -196,21 +800,26 @@ asmlinkage long sys_mlockall(int flags)
if (!can_do_mlock())
goto out;
- down_write(&current->mm->mmap_sem);
+ if (flags & MCL_CURRENT)
+ lru_add_drain_all(); /* flush pagevec */
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
+ down_write(&current->mm->mmap_sem);
+
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
+ if (!ret && (flags & MCL_CURRENT))
+ mm_populate(0, TASK_SIZE);
out:
return ret;
}
-asmlinkage long sys_munlockall(void)
+SYSCALL_DEFINE0(munlockall)
{
int ret;
@@ -232,10 +841,13 @@ int user_shm_lock(size_t size, struct user_struct *user)
int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ if (lock_limit == RLIM_INFINITY)
+ allowed = 1;
lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
- if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+ if (!allowed &&
+ locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
goto out;
get_uid(user);
user->locked_shm += locked;
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 00000000000..4074caf9936
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,205 @@
+/*
+ * mm_init.c - Memory initialisation verification and debugging
+ *
+ * Copyright 2008 IBM Corporation, 2008
+ * Author Mel Gorman <mel@csn.ul.ie>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
+#include "internal.h"
+
+#ifdef CONFIG_DEBUG_MEMORY_INIT
+int mminit_loglevel;
+
+#ifndef SECTIONS_SHIFT
+#define SECTIONS_SHIFT 0
+#endif
+
+/* Print each online node's zonelists at KERN_DEBUG. The zonelists are simply reported, validation is manual. */
+void mminit_verify_zonelist(void)
+{
+ int nid;
+
+ if (mminit_loglevel < MMINIT_VERIFY) /* nothing is printed below the MMINIT_VERIFY level */
+ return;
+
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone;
+ struct zoneref *z;
+ struct zonelist *zonelist;
+ int i, listid, zoneid;
+
+ BUG_ON(MAX_ZONELISTS > 2); /* the label printed below only tells list 0 from any other list */
+ for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
+
+ /* Identify the zone and nodelist: i enumerates every (listid, zoneid) pair */
+ zoneid = i % MAX_NR_ZONES;
+ listid = i / MAX_NR_ZONES;
+ zonelist = &pgdat->node_zonelists[listid];
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Print information about the zonelist */
+ printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
+ listid > 0 ? "thisnode" : "general", nid,
+ zone->name);
+
+ /* Iterate the zonelist; note that this reuses 'zone' as the loop cursor */
+ for_each_zone_zonelist(zone, z, zonelist, zoneid) {
+#ifdef CONFIG_NUMA
+ printk(KERN_CONT "%d:%s ",
+ zone->node, zone->name);
+#else
+ printk(KERN_CONT "0:%s ", zone->name);
+#endif /* CONFIG_NUMA */
+ }
+ printk(KERN_CONT "\n");
+ }
+ }
+}
+
+void __init mminit_verify_pageflags_layout(void)
+{
+ int shift, width;
+ unsigned long or_mask, add_mask;
+
+ shift = 8 * sizeof(unsigned long); /* number of bits in an unsigned long */
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; /* NOTE(review): last term is a *_SHIFT while the others are *_WIDTH - confirm intended */
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+ "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ SECTIONS_WIDTH,
+ NODES_WIDTH,
+ ZONES_WIDTH,
+ LAST_CPUPID_WIDTH,
+ NR_PAGEFLAGS);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ "Section %d Node %d Zone %d Lastcpupid %d\n",
+ SECTIONS_SHIFT,
+ NODES_SHIFT,
+ ZONES_SHIFT,
+ LAST_CPUPID_SHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ (unsigned long)SECTIONS_PGSHIFT,
+ (unsigned long)NODES_PGSHIFT,
+ (unsigned long)ZONES_PGSHIFT,
+ (unsigned long)LAST_CPUPID_PGSHIFT);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
+ "Node/Zone ID: %lu -> %lu\n",
+ (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
+ (unsigned long)ZONEID_PGOFF);
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
+ "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
+ shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+ "Node not in page flags");
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
+ "Last cpupid not in page flags");
+#endif
+
+ if (SECTIONS_WIDTH) { /* walk down from the top bit: each field that is present must start at its *_PGSHIFT */
+ shift -= SECTIONS_WIDTH;
+ BUG_ON(shift != SECTIONS_PGSHIFT);
+ }
+ if (NODES_WIDTH) {
+ shift -= NODES_WIDTH;
+ BUG_ON(shift != NODES_PGSHIFT);
+ }
+ if (ZONES_WIDTH) {
+ shift -= ZONES_WIDTH;
+ BUG_ON(shift != ZONES_PGSHIFT);
+ }
+
+ /* Check for bitmask overlaps: the OR and the sum agree only if no two masks share a bit */
+ or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
+ (NODES_MASK << NODES_PGSHIFT) |
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
+ (NODES_MASK << NODES_PGSHIFT) +
+ (SECTIONS_MASK << SECTIONS_PGSHIFT);
+ BUG_ON(or_mask != add_mask);
+}
+
+void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
+ unsigned long nid, unsigned long pfn)
+{
+ BUG_ON(page_to_nid(page) != nid); /* the node, zone and pfn the page reports must equal the values passed in */
+ BUG_ON(page_zonenum(page) != zone);
+ BUG_ON(page_to_pfn(page) != pfn);
+}
+
+static __init int set_mminit_loglevel(char *str)
+{
+ get_option(&str, &mminit_loglevel); /* parse the parameter value into mminit_loglevel; the parse result is not checked */
+ return 0; /* always reports success */
+}
+early_param("mminit_loglevel", set_mminit_loglevel); /* handler for the "mminit_loglevel" early parameter */
+#endif /* CONFIG_DEBUG_MEMORY_INIT */
+
+struct kobject *mm_kobj;
+EXPORT_SYMBOL_GPL(mm_kobj);
+
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+
+static void __meminit mm_compute_batch(void)
+{
+ u64 memsized_batch;
+ s32 nr = num_present_cpus();
+ s32 batch = max_t(s32, nr*2, 32); /* floor: two per present CPU, and never below 32 */
+
+ /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+ memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); /* 1/256 of the per-CPU share, capped so it fits in an s32 */
+
+ vm_committed_as_batch = max_t(s32, memsized_batch, batch); /* the larger of the memory-sized value and the floor wins */
+}
+
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ switch (action) { /* only memory going online or offline triggers a recompute */
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ mm_compute_batch(); /* fall through */
+ default:
+ break;
+ }
+ return NOTIFY_OK; /* returned for every action, handled or not */
+}
+
+static struct notifier_block compute_batch_nb __meminitdata = {
+ .notifier_call = mm_compute_batch_notifier, /* registered by mm_compute_batch_init() via register_hotmemory_notifier() */
+ .priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+
+static int __init mm_compute_batch_init(void)
+{
+ mm_compute_batch(); /* compute the initial vm_committed_as_batch */
+ register_hotmemory_notifier(&compute_batch_nb); /* then recompute it on MEM_ONLINE / MEM_OFFLINE */
+
+ return 0;
+}
+
+__initcall(mm_compute_batch_init);
+
+#endif
+
+static int __init mm_sysfs_init(void)
+{
+ mm_kobj = kobject_create_and_add("mm", kernel_kobj); /* "mm" kobject parented to kernel_kobj; presumably shows up as /sys/kernel/mm - confirm */
+ if (!mm_kobj)
+ return -ENOMEM; /* creation failure is reported as out of memory */
+
+ return 0;
+}
+postcore_initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index eea8eefd51a..129b847d30c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,11 +3,16 @@
*
* Written by obz.
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -21,29 +26,40 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/profile.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/perf_event.h>
+#include <linux/audit.h>
+#include <linux/khugepaged.h>
+#include <linux/uprobes.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+
+#include "internal.h"
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags) (0)
#endif
+#ifndef arch_rebalance_pgtables
+#define arch_rebalance_pgtables(addr, len) (addr)
+#endif
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
@@ -66,15 +82,37 @@ pgprot_t protection_map[16] = {
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- return protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ return __pgprot(pgprot_val(protection_map[vm_flags &
+ (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
+ pgprot_val(arch_vm_get_page_prot(vm_flags)));
}
EXPORT_SYMBOL(vm_get_page_prot);
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
+int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+/*
+ * Make sure vm_committed_as is in one cacheline and does not share that
+ * cacheline with other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment. This accessor reports the vm_committed_as counter.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as); /* presumably an approximate, never-negative read - confirm against the percpu_counter API */
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
/*
* Check that a process has enough memory to allocate a new virtual
@@ -92,9 +130,9 @@ atomic_t vm_committed_space = ATOMIC_INIT(0);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -105,10 +143,18 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
- free = global_page_state(NR_FILE_PAGES);
- free += nr_swap_pages;
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
/*
* Any slabs which are created with the
@@ -119,34 +165,18 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -154,24 +184,22 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
goto error;
}
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
- allowed += total_swap_pages;
-
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
+ * Don't let a single process grow so big a user can't recover
*/
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
@@ -179,29 +207,27 @@ error:
return -ENOMEM;
}
-EXPORT_SYMBOL(__vm_enough_memory);
-
/*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_mutex
*/
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ atomic_inc(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable--;
flush_dcache_mmap_lock(mapping);
if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.vm_set.list);
+ list_del_init(&vma->shared.nonlinear);
else
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/*
- * Unlink a file-based vm structure from its prio_tree, to hide
+ * Unlink a file-based vm structure from its interval tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
*/
void unlink_file_vma(struct vm_area_struct *vma)
@@ -210,9 +236,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
if (file) {
struct address_space *mapping = file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
__remove_shared_vm_struct(vma, file, mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
}
@@ -228,20 +254,37 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
- mpol_free(vma_policy(vma));
+ mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
}
-asmlinkage unsigned long sys_brk(unsigned long brk)
+static unsigned long do_brk(unsigned long addr, unsigned long len);
+
+SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
+ unsigned long min_brk;
+ bool populate;
down_write(&mm->mmap_sem);
- if (brk < mm->end_code)
+#ifdef CONFIG_COMPAT_BRK
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (current->brk_randomized)
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
+#else
+ min_brk = mm->start_brk;
+#endif
+ if (brk < min_brk)
goto out;
/*
@@ -250,8 +293,9 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
- if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
+ rlim = rlimit(RLIMIT_DATA);
+ if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
goto out;
newbrk = PAGE_ALIGN(brk);
@@ -273,73 +317,217 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
+
set_brk:
mm->brk = brk;
+ populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(oldbrk, newbrk - oldbrk);
+ return brk;
+
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}
-#ifdef DEBUG_MM_RB
+static long vma_compute_subtree_gap(struct vm_area_struct *vma) /* NOTE(review): returns an unsigned long value through a long return type */
+{
+ unsigned long max, subtree_gap;
+ max = vma->vm_start; /* gap just below this vma: vm_start minus the previous vma's end (or minus nothing if there is none) */
+ if (vma->vm_prev)
+ max -= vma->vm_prev->vm_end;
+ if (vma->vm_rb.rb_left) { /* fold in the value cached in the left child */
+ subtree_gap = rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ if (vma->vm_rb.rb_right) { /* and the value cached in the right child */
+ subtree_gap = rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ return max; /* largest of: own gap, left child's rb_subtree_gap, right child's rb_subtree_gap */
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
- int i = 0, j;
+ int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev)
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
- if (vma->vm_start < pend)
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
- if (vma->vm_start > vma->vm_end)
- printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
+ if (vma->vm_start < prev) {
+ pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ bug = 1;
+ }
+ if (vma->vm_start < pend) {
+ pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ bug = 1;
+ }
+ if (vma->vm_start > vma->vm_end) {
+ pr_info("vm_end %lx < vm_start %lx\n",
+ vma->vm_end, vma->vm_start);
+ bug = 1;
+ }
+ if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
+ pr_info("free gap %lx, correct %lx\n",
+ vma->rb_subtree_gap,
+ vma_compute_subtree_gap(vma));
+ bug = 1;
+ }
i++;
pn = nd;
+ prev = vma->vm_start;
+ pend = vma->vm_end;
}
j = 0;
- for (nd = pn; nd; nd = rb_prev(nd)) {
+ for (nd = pn; nd; nd = rb_prev(nd))
j++;
+ if (i != j) {
+ pr_info("backwards %d, forwards %d\n", j, i);
+ bug = 1;
}
- if (i != j)
- printk("backwards %d, forwards %d\n", j, i), i = 0;
- return i;
+ return bug ? -1 : i;
}
-void validate_mm(struct mm_struct *mm)
+static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
+{
+ struct rb_node *nd;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) { /* visit every node of the tree */
+ struct vm_area_struct *vma;
+ vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+ BUG_ON(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); /* cached gap must match a fresh computation, except for 'ignore' */
+ }
+}
+
+static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- struct vm_area_struct *tmp = mm->mmap;
- while (tmp) {
- tmp = tmp->vm_next;
+ unsigned long highest_address = 0;
+ struct vm_area_struct *vma = mm->mmap;
+ while (vma) {
+ struct anon_vma_chain *avc;
+ vma_lock_anon_vma(vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ vma_unlock_anon_vma(vma);
+ highest_address = vma->vm_end;
+ vma = vma->vm_next;
i++;
}
- if (i != mm->map_count)
- printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ pr_info("map_count %d vm_next %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ if (highest_address != mm->highest_vm_end) {
+ pr_info("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
+ bug = 1;
+ }
i = browse_rb(&mm->mm_rb);
- if (i != mm->map_count)
- printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ pr_info("map_count %d rb %d\n", mm->map_count, i);
+ bug = 1;
+ }
BUG_ON(bug);
}
#else
+#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif
-static struct vm_area_struct *
-find_vma_prepare(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev, struct rb_node ***rb_link,
- struct rb_node ** rb_parent)
+RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+
+/*
+ * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
+ * vma->vm_prev->vm_end values changed, without modifying the vma's position
+ * in the rbtree.
+ */
+static void vma_gap_update(struct vm_area_struct *vma)
+{
+ /*
+ * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
+ * function that does exactly what we want.
+ */
+ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); /* NULL stop node: presumably propagates all the way up to the root - confirm */
+}
+
+static inline void vma_rb_insert(struct vm_area_struct *vma,
+ struct rb_root *root)
+{
+ /* All rb_subtree_gap values must be consistent prior to insertion */
+ validate_mm_rb(root, NULL); /* expands to nothing unless CONFIG_DEBUG_VM_RB is set */
+
+ rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); /* insertion step that uses the vma_gap_callbacks declared in this file */
+}
+
+static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+{
+ /*
+ * All rb_subtree_gap values must be consistent prior to erase,
+ * with the possible exception of the vma being erased.
+ */
+ validate_mm_rb(root, vma); /* expands to nothing unless CONFIG_DEBUG_VM_RB is set */
+
+ /*
+ * Note rb_erase_augmented is a fairly large inline function,
+ * so make sure we instantiate it only once with our desired
+ * augmented rbtree callbacks (hence this function is not inline).
+ */
+ rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) /* for every chain entry hanging off this vma */
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); /* take it out of its anon_vma's tree */
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) /* counterpart of ..._pre_update_vma(): same walk over the vma's chain */
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); /* put each entry back into its anon_vma's tree */
+}
+
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+ unsigned long end, struct vm_area_struct **pprev,
+ struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
- vma = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
@@ -348,9 +536,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- return vma;
+ /* Fail if an existing vma overlaps the area */
+ if (vma_tmp->vm_start < end)
+ return -ENOMEM;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
@@ -363,43 +551,71 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
- return vma;
+ return 0;
}
-static inline void
-__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node *rb_parent)
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
- if (prev) {
- vma->vm_next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- mm->mmap = vma;
- if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
- struct vm_area_struct, vm_rb);
- else
- vma->vm_next = NULL;
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlapping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
}
+
+ return nr_pages;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
+ /* Update tracking information for the gap following the new vma. */
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ mm->highest_vm_end = vma->vm_end;
+
+ /*
+ * vma->vm_prev wasn't known when we followed the rbtree to find the
+ * correct insertion point for that vma. As a result, we could not
+ * update the vma vm_rb parents rb_subtree_gap values on the way down.
+ * So, we first insert the vma with a zero rb_subtree_gap value
+ * (to be consistent with what we did on the way down), and then
+ * immediately update the gap to the correct value. Finally we
+ * rebalance the rbtree after all augmented values have been set.
+ */
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ vma->rb_subtree_gap = 0;
+ vma_gap_update(vma);
+ vma_rb_insert(vma, &mm->mm_rb);
}
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
{
- struct file * file;
+ struct file *file;
file = vma->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file->f_dentry->d_inode->i_writecount);
+ atomic_dec(&file_inode(file)->i_writecount);
if (vma->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
@@ -407,7 +623,7 @@ static inline void __vma_link_file(struct vm_area_struct *vma)
if (unlikely(vma->vm_flags & VM_NONLINEAR))
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
else
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -419,7 +635,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
__vma_link_list(mm, vma, prev, rb_parent);
__vma_link_rb(mm, vma, rb_link, rb_parent);
- __anon_vma_link(vma);
}
static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -428,39 +643,33 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct address_space *mapping = NULL;
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
-
- if (mapping) {
- spin_lock(&mapping->i_mmap_lock);
- vma->vm_truncate_count = mapping->truncate_count;
+ mutex_lock(&mapping->i_mmap_mutex);
}
- anon_vma_lock(vma);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
- anon_vma_unlock(vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
mm->map_count++;
validate_mm(mm);
}
/*
- * Helper for vma_adjust in the split_vma insert case:
- * insert vm structure into list and rbtree and anon_vma,
- * but it has already been inserted into prio_tree earlier.
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree. It has already been inserted into the interval tree.
*/
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
- __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- BUG_ON(__vma && __vma->vm_start < vma->vm_end);
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
}
@@ -469,10 +678,15 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
- rb_erase(&vma->vm_rb, &mm->mm_rb);
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+ struct vm_area_struct *next;
+
+ vma_rb_erase(vma, &mm->mm_rb);
+ prev->vm_next = next = vma->vm_next;
+ if (next)
+ next->vm_prev = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -482,20 +696,23 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
-void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next;
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
- struct prio_tree_root *root = NULL;
- struct file *file = vma->vm_file;
+ struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
+ struct file *file = vma->vm_file;
+ bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
if (next && !insert) {
+ struct vm_area_struct *exporter = NULL;
+
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -503,7 +720,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
*/
again: remove_next = 1 + (end > next->vm_end);
end = next->vm_end;
- anon_vma = next->anon_vma;
+ exporter = next;
importer = vma;
} else if (end > next->vm_start) {
/*
@@ -511,7 +728,7 @@ again: remove_next = 1 + (end > next->vm_end);
* mprotect case 5 shifting the boundary up.
*/
adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
- anon_vma = next->anon_vma;
+ exporter = next;
importer = vma;
} else if (end < vma->vm_end) {
/*
@@ -520,28 +737,37 @@ again: remove_next = 1 + (end > next->vm_end);
* mprotect case 4 shifting the boundary down.
*/
adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
- anon_vma = next->anon_vma;
+ exporter = vma;
importer = next;
}
+
+ /*
+ * Easily overlooked: when mprotect shifts the boundary,
+ * make sure the expanding vma has anon_vma set if the
+ * shrinking vma had, to cover any anon pages imported.
+ */
+ if (exporter && exporter->anon_vma && !importer->anon_vma) {
+ if (anon_vma_clone(importer, exporter))
+ return -ENOMEM;
+ importer->anon_vma = exporter->anon_vma;
+ }
}
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR))
+ if (!(vma->vm_flags & VM_NONLINEAR)) {
root = &mapping->i_mmap;
- spin_lock(&mapping->i_mmap_lock);
- if (importer &&
- vma->vm_truncate_count != next->vm_truncate_count) {
- /*
- * unmap_mapping_range might be in progress:
- * ensure that the expanding vma is rescanned.
- */
- importer->vm_truncate_count = 0;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start,
+ next->vm_end);
}
+
+ mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
- insert->vm_truncate_count = vma->vm_truncate_count;
/*
- * Put into prio_tree now, so instantiated pages
+ * Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
@@ -550,34 +776,35 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
- /*
- * When changing only vma->vm_end, we don't really need
- * anon_vma lock: but is that case worth optimizing out?
- */
- if (vma->anon_vma)
- anon_vma = vma->anon_vma;
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
+ anon_vma = vma->anon_vma;
+ if (!anon_vma && adjust_next)
+ anon_vma = next->anon_vma;
if (anon_vma) {
- spin_lock(&anon_vma->lock);
- /*
- * Easily overlooked: when mprotect shifts the boundary,
- * make sure the expanding vma has anon_vma set if the
- * shrinking vma had, to cover any anon pages imported.
- */
- if (importer && !importer->anon_vma) {
- importer->anon_vma = anon_vma;
- __anon_vma_link(importer);
- }
+ VM_BUG_ON(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma);
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_pre_update_vma(next);
}
if (root) {
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, root);
+ vma_interval_tree_remove(vma, root);
if (adjust_next)
- vma_prio_tree_remove(next, root);
+ vma_interval_tree_remove(next, root);
}
- vma->vm_start = start;
- vma->vm_end = end;
+ if (start != vma->vm_start) {
+ vma->vm_start = start;
+ start_changed = true;
+ }
+ if (end != vma->vm_end) {
+ vma->vm_end = end;
+ end_changed = true;
+ }
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
@@ -586,8 +813,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (root) {
if (adjust_next)
- vma_prio_tree_insert(next, root);
- vma_prio_tree_insert(vma, root);
+ vma_interval_tree_insert(next, root);
+ vma_interval_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
@@ -599,8 +826,6 @@ again: remove_next = 1 + (end > next->vm_end);
__vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
- if (next->anon_vma)
- __anon_vma_merge(vma, next);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
@@ -608,43 +833,80 @@ again: remove_next = 1 + (end > next->vm_end);
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
+ } else {
+ if (start_changed)
+ vma_gap_update(vma);
+ if (end_changed) {
+ if (!next)
+ mm->highest_vm_end = end;
+ else if (!adjust_next)
+ vma_gap_update(next);
+ }
}
- if (anon_vma)
- spin_unlock(&anon_vma->lock);
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_post_update_vma(next);
+ anon_vma_unlock_write(anon_vma);
+ }
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ if (root) {
+ uprobe_mmap(vma);
+
+ if (adjust_next)
+ uprobe_mmap(next);
+ }
if (remove_next) {
- if (file)
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
mm->map_count--;
- mpol_free(vma_policy(next));
+ mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
- if (remove_next == 2) {
- next = vma->vm_next;
+ next = vma->vm_next;
+ if (remove_next == 2)
goto again;
- }
+ else if (next)
+ vma_gap_update(next);
+ else
+ mm->highest_vm_end = end;
}
+ if (insert && file)
+ uprobe_mmap(insert);
validate_mm(mm);
+
+ return 0;
}
/*
* If the vma has a ->close operation then the driver probably needs to release
* per-vma resources, so we don't attempt to merge those.
*/
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
-
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags != vm_flags)
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressure on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
@@ -654,9 +916,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
}
static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2)
+ struct anon_vma *anon_vma2,
+ struct vm_area_struct *vma)
{
- return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2);
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ if ((!anon_vma1 || !anon_vma2) && (!vma ||
+ list_is_singular(&vma->anon_vma_chain)))
+ return 1;
+ return anon_vma1 == anon_vma2;
}
/*
@@ -675,7 +945,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
}
@@ -694,9 +964,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
{
if (is_mergeable_vma(vma, file, vm_flags) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma)) {
+ is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
- vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
return 1;
}
@@ -740,6 +1010,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
+ int err;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -771,13 +1042,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen) &&
is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma)) {
+ next->anon_vma, NULL)) {
/* cases 1, 6 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL);
} else /* cases 2, 5, 7 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL);
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(prev);
return prev;
}
@@ -789,11 +1063,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
- vma_adjust(prev, prev->vm_start,
+ err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
else /* cases 3, 8 */
- vma_adjust(area, addr, next->vm_end,
+ err = vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL);
+ if (err)
+ return NULL;
+ khugepaged_enter_vma_merge(area);
return area;
}
@@ -801,6 +1078,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
/*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{ /* Nonzero only if every check below holds; 'a' must be the lower-addressed vma. */
+ return a->vm_end == b->vm_start && /* a ends exactly where b begins */
+ mpol_equal(vma_policy(a), vma_policy(b)) && /* same memory policy */
+ a->vm_file == b->vm_file && /* same vm_file pointer (both may be NULL) */
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && /* flags differ at most in READ/WRITE/EXEC/SOFTDIRTY */
+ b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); /* b's page offset continues a's linearly */
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mmap_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mmap_sem.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{ /* Returns old's anon_vma if it may be shared with the neighbouring vma, else NULL. */
+ if (anon_vma_compatible(a, b)) { /* a and b are passed in address order; one of them is 'old' */
+ struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); /* single load: the pointer may be set concurrently */
+
+ if (anon_vma && list_is_singular(&old->anon_vma_chain)) /* only reuse when old's chain has exactly one entry */
+ return anon_vma;
+ }
+ return NULL; /* incompatible neighbours, no anon_vma yet, or a non-singular chain */
+}
+
+/*
* find_mergeable_anon_vma is used by anon_vma_prepare, to check
* neighbouring vmas for a suitable anon_vma, before it goes off
* to allocate a new anon_vma. It checks because a repetitive
@@ -810,48 +1142,24 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ struct anon_vma *anon_vma;
struct vm_area_struct *near;
- unsigned long vm_flags;
near = vma->vm_next;
if (!near)
goto try_prev;
- /*
- * Since only mprotect tries to remerge vmas, match flags
- * which might be mprotected into each other later on.
- * Neither mlock nor madvise tries to remerge at present,
- * so leave their flags as obstructing a merge.
- */
- vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
- vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
- if (near->anon_vma && vma->vm_end == near->vm_start &&
- mpol_equal(vma_policy(vma), vma_policy(near)) &&
- can_vma_merge_before(near, vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff +
- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
- return near->anon_vma;
+ anon_vma = reusable_anon_vma(near, vma, near);
+ if (anon_vma)
+ return anon_vma;
try_prev:
- /*
- * It is potentially slow to have to call find_vma_prev here.
- * But it's only on the first write fault on the vma, not
- * every time, and we could devise a way to avoid it later
- * (e.g. stash info in next's anon_vma_node when assigning
- * an anon_vma, or when trying vma_merge). Another time.
- */
- BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+ near = vma->vm_prev;
if (!near)
goto none;
- vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
- vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-
- if (near->anon_vma && near->vm_end == vma->vm_start &&
- mpol_equal(vma_policy(near), vma_policy(vma)) &&
- can_vma_merge_after(near, vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff))
- return near->anon_vma;
+ anon_vma = reusable_anon_vma(near, near, vma);
+ if (anon_vma)
+ return anon_vma;
none:
/*
* There's no absolute need to look only at touching neighbours:
@@ -871,46 +1179,62 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
const unsigned long stack_flags
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
+ mm->total_vm += pages;
+
if (file) {
mm->shared_vm += pages;
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
mm->exec_vm += pages;
} else if (flags & stack_flags)
mm->stack_vm += pages;
- if (flags & (VM_RESERVED|VM_IO))
- mm->reserved_vm += pages;
}
#endif /* CONFIG_PROC_FS */
/*
- * The caller must hold down_write(current->mm->mmap_sem).
+ * If a non-zero hint addr is less than mmap_min_addr, change the hint to
+ * be as low as possible while still not below mmap_min_addr.
+ */
+static inline unsigned long round_hint_to_min(unsigned long hint)
+{
+ hint &= PAGE_MASK; /* keep only the PAGE_MASK bits of the hint (drops the in-page offset) */
+ if (((void *)hint != NULL) && /* a zero hint is passed through unchanged */
+ (hint < mmap_min_addr))
+ return PAGE_ALIGN(mmap_min_addr); /* too-low hint: use page-aligned mmap_min_addr instead */
+ return hint; /* zero, or already at/above mmap_min_addr */
+}
+
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags, /* only the VM_LOCKED bit is examined */
+ unsigned long len) /* request length in bytes (shifted by PAGE_SHIFT below) */
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT; /* pages this request would add */
+ locked += mm->locked_vm; /* plus the mm's current locked_vm count */
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT; /* limit converted to pages, same unit as 'locked' */
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) /* CAP_IPC_LOCK bypasses the limit */
+ return -EAGAIN;
+ }
+ return 0; /* VM_LOCKED not requested, or the request is permitted */
+}
+
+/*
+ * The caller must hold down_write(&current->mm->mmap_sem).
*/
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate)
{
struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
- struct inode *inode;
- unsigned int vm_flags;
- int correct_wcount = 0;
- int error;
- struct rb_node ** rb_link, * rb_parent;
- int accountable = 1;
- unsigned long charged = 0, reqprot = prot;
+ vm_flags_t vm_flags;
- if (file) {
- if (is_file_hugepages(file))
- accountable = 0;
+ *populate = 0;
- if (!file->f_op || !file->f_op->mmap)
- return -ENODEV;
-
- if ((prot & PROT_EXEC) &&
- (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
- return -EPERM;
- }
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
@@ -918,19 +1242,18 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
if (!len)
return -EINVAL;
- error = arch_mmap_check(addr, len, flags);
- if (error)
- return error;
+ if (!(flags & MAP_FIXED))
+ addr = round_hint_to_min(addr);
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
- if (!len || len > TASK_SIZE)
+ if (!len)
return -ENOMEM;
/* offset overflow? */
@@ -955,25 +1278,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED) {
+ if (flags & MAP_LOCKED)
if (!can_do_mlock())
return -EPERM;
- vm_flags |= VM_LOCKED;
- }
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- inode = file ? file->f_dentry->d_inode : NULL;
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
+ struct inode *inode = file_inode(file);
+
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -989,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
/*
* Make sure there are no mandatory locks on the file.
*/
- if (locks_verify_locked(inode))
+ if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
@@ -1000,6 +1314,16 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (vm_flags & VM_EXEC)
+ return -EPERM;
+ vm_flags &= ~VM_MAYEXEC;
+ }
+
+ if (!file->f_op->mmap)
+ return -ENODEV;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
break;
default:
@@ -1008,6 +1332,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
} else {
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ return -EINVAL;
+ /*
+ * Ignore pgoff.
+ */
+ pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE;
break;
case MAP_PRIVATE:
@@ -1021,48 +1351,200 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
}
}
- error = security_file_mmap(file, reqprot, prot, flags);
- if (error)
- return error;
-
+ /*
+ * Set 'VM_NORESERVE' if we should not account for the
+ * memory use of this mapping.
+ */
+ if (flags & MAP_NORESERVE) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
+
+ addr = mmap_region(file, addr, len, vm_flags, pgoff);
+ if (!IS_ERR_VALUE(addr) &&
+ ((vm_flags & VM_LOCKED) ||
+ (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+ *populate = len;
+ return addr;
+}
+
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{ /* Resolves fd (or creates an anonymous hugetlb file) and hands off to vm_mmap_pgoff(). */
+ struct file *file = NULL;
+ unsigned long retval = -EBADF; /* result if fget() fails below */
+
+ if (!(flags & MAP_ANONYMOUS)) { /* file-backed request: look up the fd */
+ audit_mmap_fd(fd, flags);
+ file = fget(fd);
+ if (!file)
+ goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file))); /* align len to the file's huge page size */
+ retval = -EINVAL;
+ if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) /* MAP_HUGETLB given with a non-hugepage file */
+ goto out_fput;
+ } else if (flags & MAP_HUGETLB) { /* anonymous mapping that asks for huge pages */
+ struct user_struct *user = NULL;
+ struct hstate *hs;
+
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); /* NOTE(review): masked with SHM_HUGE_MASK here but MAP_HUGE_MASK below - confirm the two are equal */
+ if (!hs)
+ return -EINVAL; /* file is still NULL on this path, so nothing to release */
+
+ len = ALIGN(len, huge_page_size(hs));
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
+ VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE,
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+ if (IS_ERR(file))
+ return PTR_ERR(file); /* propagate the setup error code */
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); /* these two flags are cleared before mapping */
+
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
+ if (file) /* NULL for a plain anonymous mapping */
+ fput(file);
+out:
+ return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct { /* argument block that old_mmap copies in from user space */
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset; /* byte offset; old_mmap rejects it unless page-aligned */
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{ /* Unpacks a user-supplied mmap_arg_struct and forwards it to sys_mmap_pgoff(). */
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a))) /* nonzero means the user copy failed */
+ return -EFAULT;
+ if (a.offset & ~PAGE_MASK) /* offset has bits below the page boundary */
+ return -EINVAL;
+
+ return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT); /* byte offset converted to a page offset */
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma)
+{ /* Returns nonzero if this vma should use write-notify protections, 0 otherwise. */
+ vm_flags_t vm_flags = vma->vm_flags;
+
+ /* If it was private or non-writable, the write bit is already clear */
+ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) /* needs both VM_WRITE and VM_SHARED to continue */
+ return 0;
+
+ /* The backer wishes to know when pages are first written to? */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+ return 1;
+
+ /* The open routine did something to the protections already? */
+ if (pgprot_val(vma->vm_page_prot) !=
+ pgprot_val(vm_get_page_prot(vm_flags))) /* vm_page_prot no longer matches what vm_flags would yield */
+ return 0;
+
+ /* Specialty mapping? */
+ if (vm_flags & VM_PFNMAP)
+ return 0;
+
+ /* Can the mapping track the dirty pages? */
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_cap_account_dirty(vma->vm_file->f_mapping); /* zero when there is no file or no f_mapping */
+}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+{ /* Returns nonzero when the mapping should be charged against the memory commit. */
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
+ return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; /* VM_WRITE set, VM_NORESERVE and VM_SHARED both clear */
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
+ int error;
+ struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = 0;
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersect with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ if (!(vm_flags & MAP_FIXED))
+ return -ENOMEM;
+
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
- if (accountable && (!(flags & MAP_NORESERVE) ||
- sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
- if (vm_flags & VM_SHARED) {
- /* Check memory availability in shmem_file_setup? */
- vm_flags |= VM_ACCOUNT;
- } else if (vm_flags & VM_WRITE) {
- /*
- * Private writable mapping: check memory availability
- */
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
}
/*
- * Can we just expand an old private anonymous mapping?
- * The VM_SHARED test is necessary because shmem_zero_setup
- * will create the file object for a shared anonymous map below.
+ * Can we just expand an old mapping?
*/
- if (!file && !(vm_flags & VM_SHARED) &&
- vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, NULL, pgoff, NULL))
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ if (vma)
goto out;
/*
@@ -1080,85 +1562,87 @@ munmap_back:
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
- vma->vm_page_prot = protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
if (file) {
- error = -EINVAL;
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
if (vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
- correct_wcount = 1;
}
- vma->vm_file = file;
- get_file(file);
+ vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
+
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ * Bug: If addr is changed, prev, rb_link, rb_parent should
+ * be updated for vma_link()
+ */
+ WARN_ON_ONCE(addr != vma->vm_start);
+
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
- /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
- * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
- * that memory reservation must be checked; but that reservation
- * belongs to shared memory object, not to vma: so now clear it.
- */
- if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
- vma->vm_flags &= ~VM_ACCOUNT;
+ if (vma_wants_writenotify(vma)) {
+ pgprot_t pprot = vma->vm_page_prot;
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- */
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
-
- if (vma_wants_writenotify(vma))
- vma->vm_page_prot =
- protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
-
- if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
- vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
- file = vma->vm_file;
- vma_link(mm, vma, prev, rb_link, rb_parent);
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- } else {
- if (file) {
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
- fput(file);
- }
- mpol_free(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ /* Can vma->vm_page_prot have changed??
+ *
+ * Answer: Yes, drivers may have changed it in their
+ * f_op->mmap method.
+ *
+ * Ensures that vmas marked as uncached stay that way.
+ */
+ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+ if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
}
-out:
- mm->total_vm += len >> PAGE_SHIFT;
+
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ /* Once vma denies write, undo our temporary denial count */
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ file = vma->vm_file;
+out:
+ perf_event_mmap(vma);
+
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
- make_pages_present(addr, addr + len);
- }
- if (flags & MAP_POPULATE) {
- up_write(&mm->mmap_sem);
- sys_remap_file_pages(addr, len, 0,
- pgoff, flags & MAP_NONBLOCK);
- down_write(&mm->mmap_sem);
+ if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm)))
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ else
+ vma->vm_flags &= ~VM_LOCKED;
}
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
return addr;
unmap_and_free_vma:
- if (correct_wcount)
- atomic_inc(&inode->i_writecount);
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
vma->vm_file = NULL;
fput(file);
@@ -1173,7 +1657,205 @@ unacct_error:
return error;
}
-EXPORT_SYMBOL(do_mmap_pgoff);
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ /*
+ * We implement the search by looking for an rbtree node that
+ * immediately follows a suitable gap. That is,
+ * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
+ * - gap_end = vma->vm_start >= info->low_limit + length;
+ * - gap_end - gap_start >= length
+ */
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /* Adjust search limits by the desired length */
+ if (info->high_limit < length)
+ return -ENOMEM;
+ high_limit = info->high_limit - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ goto check_highest;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ goto check_highest;
+
+ while (true) {
+ /* Visit left subtree if it looks promising */
+ gap_end = vma->vm_start;
+ if (gap_end >= low_limit && vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+check_current:
+ /* Check if current node has a suitable gap */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+ if (gap_end >= low_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit right subtree if it looks promising */
+ if (vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ goto check_highest;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_left) {
+ gap_start = vma->vm_prev->vm_end;
+ gap_end = vma->vm_start;
+ goto check_current;
+ }
+ }
+ }
+
+check_highest:
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+
+found:
+ /* We found a suitable gap. Clip it with the original low_limit. */
+ if (gap_start < info->low_limit)
+ gap_start = info->low_limit;
+
+ /* Adjust gap address to the desired alignment */
+ gap_start += (info->align_offset - gap_start) & info->align_mask;
+
+ VM_BUG_ON(gap_start + info->length > info->high_limit);
+ VM_BUG_ON(gap_start + info->length > gap_end);
+ return gap_start;
+}
+
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /*
+ * Adjust search limits by the desired length.
+ * See implementation comment at top of unmapped_area().
+ */
+ gap_end = info->high_limit;
+ if (gap_end < length)
+ return -ENOMEM;
+ high_limit = gap_end - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ if (gap_start <= high_limit)
+ goto found_highest;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ return -ENOMEM;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ return -ENOMEM;
+
+ while (true) {
+ /* Visit right subtree if it looks promising */
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ if (gap_start <= high_limit && vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+check_current:
+ /* Check if current node has a suitable gap */
+ gap_end = vma->vm_start;
+ if (gap_end < low_limit)
+ return -ENOMEM;
+ if (gap_start <= high_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit left subtree if it looks promising */
+ if (vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ return -ENOMEM;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_right) {
+ gap_start = vma->vm_prev ?
+ vma->vm_prev->vm_end : 0;
+ goto check_current;
+ }
+ }
+ }
+
+found:
+ /* We found a suitable gap. Clip it with the original high_limit. */
+ if (gap_end > info->high_limit)
+ gap_end = info->high_limit;
+
+found_highest:
+ /* Compute highest gap address at the desired alignment */
+ gap_end -= info->length;
+ gap_end -= (gap_end - info->align_offset) & info->align_mask;
+
+ VM_BUG_ON(gap_end < info->low_limit);
+ VM_BUG_ON(gap_end < gap_start);
+ return gap_end;
+}
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
@@ -1193,66 +1875,31 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info;
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- addr = TASK_UNMAPPED_BASE;
- start_addr = addr;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = mm->mmap_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = 0;
+ return vm_unmapped_area(&info);
}
#endif
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the lowest possible address?
- */
- if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
- mm->free_area_cache = addr;
- mm->cached_hole_size = ~0UL;
- }
-}
-
/*
* This mmap-allocator allocates new areas top-down from below the
* stack's low limit (the base):
@@ -1266,207 +1913,139 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE)
+ if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
- /* either no address requested or can't fit in requested address hole */
- addr = mm->free_area_cache;
-
- /* make sure it can fit in the remaining address space */
- if (addr > len) {
- vma = find_vma(mm, addr-len);
- if (!vma || addr <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = mm->mmap_base;
+ info.align_mask = 0;
+ addr = vm_unmapped_area(&info);
- if (mm->mmap_base < len)
- goto bottomup;
-
- addr = mm->mmap_base-len;
-
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
-
-bottomup:
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
#endif
-void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
-{
- /*
- * Is this a new hole at the highest possible address?
- */
- if (addr > mm->free_area_cache)
- mm->free_area_cache = addr;
-
- /* dont allow allocations above current base */
- if (mm->free_area_cache > mm->mmap_base)
- mm->free_area_cache = mm->mmap_base;
-}
-
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- unsigned long ret;
+ unsigned long (*get_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
- if (!(flags & MAP_FIXED)) {
- unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ unsigned long error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
- addr = get_area(file, addr, len, pgoff, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
- }
+ /* Careful about overflows.. */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ get_area = current->mm->get_unmapped_area;
+ if (file && file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ addr = get_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (addr & ~PAGE_MASK)
return -EINVAL;
- if (file && is_file_hugepages(file)) {
- /*
- * Check if the given range is hugepage aligned, and
- * can be made suitable for hugepages.
- */
- ret = prepare_hugepage_range(addr, len);
- } else {
- /*
- * Ensure that a normal request is not falling in a
- * reserved hugepage range. For some archs like IA-64,
- * there is a separate region for hugepages.
- */
- ret = is_hugepage_only_range(current->mm, addr, len);
- }
- if (ret)
- return -EINVAL;
- return addr;
+
+ addr = arch_rebalance_pgtables(addr, len);
+ error = security_mmap_addr(addr);
+ return error ? error : addr;
}
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
- if (mm) {
- /* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node * rb_node;
-
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
-
- while (rb_node) {
- struct vm_area_struct * vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
- }
+ /* Check the cache first. */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
+
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
EXPORT_SYMBOL(find_vma);
-/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+ */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
- struct vm_area_struct *vma = NULL, *prev = NULL;
- struct rb_node * rb_node;
- if (!mm)
- goto out;
-
- /* Guard against addr being lower than the first VMA */
- vma = mm->mmap;
-
- /* Go through the RB tree quickly. */
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
- vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ struct vm_area_struct *vma;
- if (addr < vma_tmp->vm_end) {
- rb_node = rb_node->rb_left;
- } else {
- prev = vma_tmp;
- if (!prev->vm_next || (addr < prev->vm_next->vm_end))
- break;
+ vma = find_vma(mm, addr);
+ if (vma) {
+ *pprev = vma->vm_prev;
+ } else {
+ struct rb_node *rb_node = mm->mm_rb.rb_node;
+ *pprev = NULL;
+ while (rb_node) {
+ *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
rb_node = rb_node->rb_right;
}
}
-
-out:
- *pprev = prev;
- return prev ? prev->vm_next : vma;
+ return vma;
}
/*
@@ -1474,17 +2053,18 @@ out:
* update accounting. This is shared with both the
* grow-up and grow-down cases.
*/
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
+ unsigned long new_start;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- if (size > rlim[RLIMIT_STACK].rlim_cur)
+ if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -1492,20 +2072,26 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
}
+ /* Check to ensure the stack will not grow into a hugetlb-only region */
+ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+ vma->vm_end - size;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+ return -EFAULT;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
- if (security_vm_enough_memory(grow))
+ if (security_vm_enough_memory_mm(mm, grow))
return -ENOMEM;
/* Ok, everything looks good - let it rip */
- mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1517,9 +2103,6 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifndef CONFIG_IA64
-static inline
-#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1533,15 +2116,20 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
+ * Also guard against wrapping around to address 0.
*/
- address += 4 + PAGE_SIZE - 1;
- address &= PAGE_MASK;
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else {
+ vma_unlock_anon_vma(vma);
+ return -ENOMEM;
+ }
error = 0;
/* Somebody else might have raced and expanded it already */
@@ -1551,42 +2139,47 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error)
- vma->vm_end = address;
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_end = address;
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ vma->vm_mm->highest_vm_end = address;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ perf_event_mmap(vma);
+ }
+ }
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
-#ifdef CONFIG_STACK_GROWSUP
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
-{
- return expand_upwards(vma, address);
-}
-
-struct vm_area_struct *
-find_extend_vma(struct mm_struct *mm, unsigned long addr)
-{
- struct vm_area_struct *vma, *prev;
-
- addr &= PAGE_MASK;
- vma = find_vma_prev(mm, addr, &prev);
- if (vma && (vma->vm_start <= addr))
- return vma;
- if (!prev || expand_stack(prev, addr))
- return NULL;
- if (prev->vm_flags & VM_LOCKED) {
- make_pages_present(addr, prev->vm_end);
- }
- return prev;
-}
-#else
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_stack(struct vm_area_struct *vma, unsigned long address)
+int expand_downwards(struct vm_area_struct *vma,
+ unsigned long address)
{
int error;
@@ -1596,15 +2189,19 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+
+ address &= PAGE_MASK;
+ error = security_mmap_addr(address);
+ if (error)
+ return error;
+
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
- address &= PAGE_MASK;
- error = 0;
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -1613,16 +2210,93 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ anon_vma_interval_tree_post_update_vma(vma);
+ vma_gap_update(vma);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ perf_event_mmap(vma);
+ }
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
+/*
+ * Note how expand_stack() refuses to expand the stack all the way to
+ * abut the next virtual mapping, *unless* that mapping itself is also
+ * a stack mapping. We want to leave room for a guard page, after all
+ * (the guard page itself is not added here, that is done by the
+ * actual page faulting logic)
+ *
+ * This matches the behavior of the guard page logic (see mm/memory.c:
+ * check_stack_guard_page()), which only allows the guard page to be
+ * removed under these circumstances.
+ */
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ struct vm_area_struct *next;
+
+ address &= PAGE_MASK;
+ next = vma->vm_next;
+ if (next && next->vm_start == address + PAGE_SIZE) {
+ if (!(next->vm_flags & VM_GROWSUP))
+ return -ENOMEM;
+ }
+ return expand_upwards(vma, address);
+}
+
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma, *prev;
+
+ addr &= PAGE_MASK;
+ vma = find_vma_prev(mm, addr, &prev);
+ if (vma && (vma->vm_start <= addr))
+ return vma;
+ if (!prev || expand_stack(prev, addr))
+ return NULL;
+ if (prev->vm_flags & VM_LOCKED)
+ __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ struct vm_area_struct *prev;
+
+ address &= PAGE_MASK;
+ prev = vma->vm_prev;
+ if (prev && prev->vm_end == address) {
+ if (!(prev->vm_flags & VM_GROWSDOWN))
+ return -ENOMEM;
+ }
+ return expand_downwards(vma, address);
+}
+
struct vm_area_struct *
find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
@@ -1640,9 +2314,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
- if (vma->vm_flags & VM_LOCKED) {
- make_pages_present(addr, start);
- }
+ if (vma->vm_flags & VM_LOCKED)
+ __mlock_vma_pages_range(vma, addr, start, NULL);
return vma;
}
#endif
@@ -1655,17 +2328,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
*/
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ unsigned long nr_accounted = 0;
+
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
- mm->total_vm -= nrpages;
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= nrpages;
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
+ vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -1679,17 +2354,15 @@ static void unmap_region(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
- struct mmu_gather *tlb;
- unsigned long nr_accounted = 0;
+ struct mmu_gather tlb;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
- vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
- next? next->vm_start: 0);
- tlb_finish_mmu(tlb, start, end);
+ unmap_vmas(&tlb, vma, start, end);
+ free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -1702,48 +2375,50 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
- unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
do {
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
+ if (vma) {
+ vma->vm_prev = prev;
+ vma_gap_update(vma);
+ } else
+ mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- if (mm->unmap_area == arch_unmap_area)
- addr = prev ? prev->vm_end : mm->mmap_base;
- else
- addr = vma ? vma->vm_start : mm->mmap_base;
- mm->unmap_area(mm, addr);
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
- * Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the the tail.
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
+ * munmap path where it doesn't make sense to fail.
*/
-int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
- struct mempolicy *pol;
struct vm_area_struct *new;
+ int err = -ENOMEM;
- if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+ if (is_vm_hugetlb_page(vma) && (addr &
+ ~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
- if (mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
-
- new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- return -ENOMEM;
+ goto out_err;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+
if (new_below)
new->vm_end = addr;
else {
@@ -1751,12 +2426,12 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
- pol = mpol_copy(vma_policy(vma));
- if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new);
- return PTR_ERR(pol);
- }
- vma_set_policy(new, pol);
+ err = vma_dup_policy(vma, new);
+ if (err)
+ goto out_free_vma;
+
+ if (anon_vma_clone(new, vma))
+ goto out_free_mpol;
if (new->vm_file)
get_file(new->vm_file);
@@ -1765,12 +2440,40 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
new->vm_ops->open(new);
if (new_below)
- vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+ err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
- vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
- return 0;
+ /* Success. */
+ if (!err)
+ return 0;
+
+ /* Clean everything up if vma_adjust failed. */
+ if (new->vm_ops && new->vm_ops->close)
+ new->vm_ops->close(new);
+ if (new->vm_file)
+ fput(new->vm_file);
+ unlink_anon_vmas(new);
+ out_free_mpol:
+ mpol_put(vma_policy(new));
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new);
+ out_err:
+ return err;
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ return __split_vma(mm, vma, addr, new_below);
}
/* Munmap is split into 2 main parts -- this part which finds
@@ -1790,9 +2493,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return -EINVAL;
/* Find the first overlapping VMA */
- vma = find_vma_prev(mm, start, &prev);
+ vma = find_vma(mm, start);
if (!vma)
return 0;
+ prev = vma->vm_prev;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
@@ -1808,7 +2512,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
* places tmp vma above, and higher split_vma places tmp vma below.
*/
if (start > vma->vm_start) {
- int error = split_vma(mm, vma, start, 0);
+ int error;
+
+ /*
+ * Make sure that map_count on return from munmap() will
+ * not exceed its limit; but let map_count go just above
+ * its limit temporarily, to help free resources as expected.
+ */
+ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
@@ -1817,13 +2531,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
- int error = split_vma(mm, last, end, 1);
+ int error = __split_vma(mm, last, end, 1);
if (error)
return error;
}
vma = prev? prev->vm_next: mm->mmap;
/*
+ * unlock any mlock()ed ranges before detaching vmas
+ */
+ if (mm->locked_vm) {
+ struct vm_area_struct *tmp = vma;
+ while (tmp && tmp->vm_start < end) {
+ if (tmp->vm_flags & VM_LOCKED) {
+ mm->locked_vm -= vma_pages(tmp);
+ munlock_vma_pages_all(tmp);
+ }
+ tmp = tmp->vm_next;
+ }
+ }
+
+ /*
* Remove the vma's, and unmap the actual pages
*/
detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -1835,20 +2563,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
-EXPORT_SYMBOL(do_munmap);
-
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+int vm_munmap(unsigned long start, size_t len)
{
int ret;
struct mm_struct *mm = current->mm;
- profile_munmap(addr);
-
down_write(&mm->mmap_sem);
- ret = do_munmap(mm, addr, len);
+ ret = do_munmap(mm, start, len);
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ profile_munmap(addr);
+ return vm_munmap(addr, len);
+}
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
@@ -1865,7 +2596,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
@@ -1878,27 +2609,15 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (!len)
return addr;
- if ((addr + len) > TASK_SIZE || (addr + len) < addr)
- return -EINVAL;
-
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = arch_mmap_check(addr, len, flags);
- if (error)
+ error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (error & ~PAGE_MASK)
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
@@ -1910,8 +2629,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
* Clear old maps. this also does some error checking for us
*/
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
@@ -1924,12 +2642,13 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- if (security_vm_enough_memory(len >> PAGE_SHIFT))
+ if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
- if (vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL))
+ vma = vma_merge(mm, prev, addr, addr + len, flags,
+ NULL, NULL, pgoff, NULL);
+ if (vma)
goto out;
/*
@@ -1941,61 +2660,97 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
return -ENOMEM;
}
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
- vma->vm_page_prot = protection_map[flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
+ perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED) {
- mm->locked_vm += len >> PAGE_SHIFT;
- make_pages_present(addr, addr + len);
- }
+ if (flags & VM_LOCKED)
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ vma->vm_flags |= VM_SOFTDIRTY;
return addr;
}
-EXPORT_SYMBOL(do_brk);
+unsigned long vm_brk(unsigned long addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long ret;
+ bool populate;
+
+ down_write(&mm->mmap_sem);
+ ret = do_brk(addr, len);
+ populate = ((mm->def_flags & VM_LOCKED) != 0);
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(addr, len);
+ return ret;
+}
+EXPORT_SYMBOL(vm_brk);
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
- struct mmu_gather *tlb;
- struct vm_area_struct *vma = mm->mmap;
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- unsigned long end;
+
+ /* mm's last user has gone, and it's about to be pulled down */
+ mmu_notifier_release(mm);
+
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+ if (vma->vm_flags & VM_LOCKED)
+ munlock_vma_pages_all(vma);
+ vma = vma->vm_next;
+ }
+ }
+
+ arch_exit_mmap(mm);
+
+ vma = mm->mmap;
+ if (!vma) /* Can happen if dup_mmap() received an OOM */
+ return;
lru_add_drain();
flush_cache_mm(mm);
- tlb = tlb_gather_mmu(mm, 1);
- /* Don't update_hiwater_rss(mm) here, do_exit already did */
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ /* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
- vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
- tlb_finish_mmu(tlb, 0, end);
+ unmap_vmas(&tlb, vma, 0, -1);
+
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ tlb_finish_mmu(&tlb, 0, -1);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
- while (vma)
+ while (vma) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ }
+ vm_unacct_memory(nr_accounted);
- BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ WARN_ON(atomic_long_read(&mm->nr_ptes) >
+ (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
* and into the inode's i_mmap tree. If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_mutex is taken here.
*/
-int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2013,12 +2768,13 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
- if (__vma && __vma->vm_start < vma->vm_end)
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory(vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
+
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
@@ -2028,53 +2784,78 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
* prior to moving page table entries, to effect an mremap move.
*/
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
- unsigned long addr, unsigned long len, pgoff_t pgoff)
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks)
{
struct vm_area_struct *vma = *vmap;
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
- struct mempolicy *pol;
+ bool faulted_in_anon_vma = true;
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (!vma->vm_file && !vma->anon_vma)
+ if (unlikely(!vma->vm_file && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
+ faulted_in_anon_vma = false;
+ }
- find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
*/
- if (vma_start >= new_vma->vm_start &&
- vma_start < new_vma->vm_end)
- *vmap = new_vma;
+ if (unlikely(vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)) {
+ /*
+ * The only way we can get a vma_merge with
+ * self during an mremap is if the vma hasn't
+ * been faulted in yet and we were allowed to
+ * reset the dst vma->vm_pgoff to the
+ * destination address of the mremap to allow
+ * the merge to happen. mremap must change the
+ * vm_pgoff linearity between src and dst vmas
+ * (in turn preventing a vma_merge) to be
+ * safe. It is only safe to keep the vm_pgoff
+ * linear if there are no pages mapped yet.
+ */
+ VM_BUG_ON(faulted_in_anon_vma);
+ *vmap = vma = new_vma;
+ }
+ *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
- new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
- pol = mpol_copy(vma_policy(vma));
- if (IS_ERR(pol)) {
- kmem_cache_free(vm_area_cachep, new_vma);
- return NULL;
- }
- vma_set_policy(new_vma, pol);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
new_vma->vm_pgoff = pgoff;
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
}
return new_vma;
+
+ out_free_mempol:
+ mpol_put(vma_policy(new_vma));
+ out_free_vma:
+ kmem_cache_free(vm_area_cachep, new_vma);
+ return NULL;
}
/*
@@ -2086,9 +2867,424 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
unsigned long cur = mm->total_vm; /* pages */
unsigned long lim;
- lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
if (cur + npages > lim)
return 0;
return 1;
}
+
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+ return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+ .name = special_mapping_name,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
+
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ pgoff_t pgoff;
+ struct page **pages;
+
+ /*
+ * special mappings have no vm_file, and in that case, the mm
+ * uses vm_pgoff internally. So we have to subtract it from here.
+ * We are allowed to do this because we are the mm; do not copy
+ * this code into drivers!
+ */
+ pgoff = vmf->pgoff - vma->vm_pgoff;
+
+ if (vma->vm_ops == &legacy_special_mapping_vmops)
+ pages = vma->vm_private_data;
+ else
+ pages = ((struct vm_special_mapping *)vma->vm_private_data)->
+ pages;
+
+ for (; pgoff && *pages; ++pages)
+ pgoff--;
+
+ if (*pages) {
+ struct page *page = *pages;
+ get_page(page);
+ vmf->page = page;
+ return 0;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_area_struct *__install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_operations_struct *ops,
+ void *priv)
+{
+ int ret;
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (unlikely(vma == NULL))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma->vm_mm = mm;
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ vma->vm_ops = ops;
+ vma->vm_private_data = priv;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
+
+ mm->total_vm += len >> PAGE_SHIFT;
+
+ perf_event_mmap(vma);
+
+ return vma;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+ return __install_special_mapping(mm, addr, len, vm_flags,
+ &special_mapping_vmops, (void *)spec);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma = __install_special_mapping(
+ mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
+ (void *)pages);
+
+ return PTR_ERR_OR_ZERO(vma);
+}
+
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
+ /*
+ * The LSB of rb_root.rb_node can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
+ /*
+ * We can safely modify rb_root.rb_node after taking the
+ * anon_vma->root->rwsem. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need for atomic instructions here, rb_root.rb_node
+ * can't change from under us thanks to the
+ * anon_vma->root->rwsem.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_node))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_sem in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_sem until mm_drop_all_locks() returns.
+ *
+ * mmap_sem in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout (for example populate_range() with
+ * nonlinear vmas). It's also needed in write mode to prevent new
+ * anon_vmas from being associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->root->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag
+ * in mapping->flags prevent taking the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We can take all the locks in random order because the VM code
+ * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * takes more than one of them in a row. Secondly we're protected
+ * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
+ * that may have to take thousands of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_lock_anon_vma(mm, avc->anon_vma);
+ }
+
+ return 0;
+
+out_unlock:
+ mm_drop_all_locks(mm);
+ return -EINTR;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
+ /*
+ * The LSB of rb_root.rb_node can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the anon_vma so the users using the anon_vma->rb_root will
+ * never see our bitflag.
+ *
+ * No need for atomic instructions here, rb_root.rb_node
+ * can't change from under us until we release the
+ * anon_vma->root->rwsem.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_node))
+ BUG();
+ anon_vma_unlock_write(anon_vma);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ mutex_unlock(&mapping->i_mmap_mutex);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_sem cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_unlock_anon_vma(avc->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
+
+/*
+ * initialise the VMA slab
+ */
+void __init mmap_init(void)
+{
+ int ret;
+
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
+}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+subsys_initcall(init_user_reserve);
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+subsys_initcall(init_admin_reserve);
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
+
+ return 0;
+}
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 00000000000..f802c2d216a
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,62 @@
+/* Copyright (C) 2009 Red Hat, Inc.
+ *
+ * See ../COPYING for licensing terms.
+ */
+
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+
+#include <asm/mmu_context.h>
+
+/*
+ * use_mm
+ * Makes the calling kernel thread take on the specified
+ * mm context.
+ * (Note: this routine is intended to be called only
+ * from a kernel thread context)
+ */
+void use_mm(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm;
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ active_mm = tsk->active_mm;
+ if (active_mm != mm) {
+ atomic_inc(&mm->mm_count);
+ tsk->active_mm = mm;
+ }
+ tsk->mm = mm;
+ switch_mm(active_mm, mm, tsk);
+ task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
+
+ if (active_mm != mm)
+ mmdrop(active_mm);
+}
+EXPORT_SYMBOL_GPL(use_mm);
+
+/*
+ * unuse_mm
+ * Reverses the effect of use_mm, i.e. releases the
+ * specified mm context which was earlier taken on
+ * by the calling kernel thread
+ * (Note: this routine is intended to be called only
+ * from a kernel thread context)
+ */
+void unuse_mm(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+ sync_mm_rss(mm);
+ tsk->mm = NULL;
+ /* active_mm is still 'mm' */
+ enter_lazy_tlb(mm, tsk);
+ task_unlock(tsk);
+}
+EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 00000000000..41cefdf0aad
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,332 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <clameter@sgi.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/srcu.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
+/*
+ * This function can't run concurrently against mmu_notifier_register
+ * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
+ * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
+ * in parallel despite there being no task using this mm any more,
+ * through the vmas outside of the exit_mmap context, such as with
+ * vmtruncate. This serializes against mmu_notifier_unregister with
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
+ * can't go away from under us as exit_mmap holds an mm_count pin
+ * itself.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ /*
+ * SRCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * If ->release runs before mmu_notifier_unregister it must be
+ * handled, as it's the only way for the driver to flush all
+ * existing sptes and stop the driver from establishing any more
+ * sptes before all the pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
+ mn = hlist_entry(mm->mmu_notifier_mm->list.first,
+ struct mmu_notifier,
+ hlist);
+ /*
+ * We arrived before mmu_notifier_unregister so
+ * mmu_notifier_unregister will do nothing other than to wait
+ * for ->release to finish and for mmu_notifier_unregister to
+ * return.
+ */
+ hlist_del_init_rcu(&mn->hlist);
+ }
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * synchronize_srcu here prevents mmu_notifier_release from returning to
+ * exit_mmap (which would proceed with freeing all pages in the mm)
+ * until the ->release method returns, if it was invoked by
+ * mmu_notifier_unregister.
+ *
+ * The mmu_notifier_mm can't go away from under us because one mm_count
+ * is held by exit_mmap.
+ */
+ synchronize_srcu(&srcu);
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending on whether the mapping
+ * previously existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->clear_flush_young)
+ young |= mn->ops->clear_flush_young(mn, mm, address);
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ int young = 0, id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->test_young) {
+ young = mn->ops->test_young(mn, mm, address);
+ if (young)
+ break;
+ }
+ }
+ srcu_read_unlock(&srcu, id);
+
+ return young;
+}
+
+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->change_pte)
+ mn->ops->change_pte(mn, mm, address, pte);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range_start)
+ mn->ops->invalidate_range_start(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_range_end)
+ mn->ops->invalidate_range_end(mn, mm, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
+
+static int do_mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ int take_mmap_sem)
+{
+ struct mmu_notifier_mm *mmu_notifier_mm;
+ int ret;
+
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+
+ /*
+ * Verify that mmu_notifier_init() has already run and the global srcu is
+ * initialized.
+ */
+ BUG_ON(!srcu.per_cpu_ref);
+
+ ret = -ENOMEM;
+ mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (unlikely(!mmu_notifier_mm))
+ goto out;
+
+ if (take_mmap_sem)
+ down_write(&mm->mmap_sem);
+ ret = mm_take_all_locks(mm);
+ if (unlikely(ret))
+ goto out_clean;
+
+ if (!mm_has_notifiers(mm)) {
+ INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+ spin_lock_init(&mmu_notifier_mm->lock);
+
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+ mmu_notifier_mm = NULL;
+ }
+ atomic_inc(&mm->mm_count);
+
+ /*
+ * Serialize the update against mmu_notifier_unregister. A
+ * side note: mmu_notifier_release can't run concurrently with
+ * us because we hold the mm_users pin (either implicitly as
+ * current->mm or explicitly with get_task_mm() or similar).
+ * We can't race against any other mmu notifier method either
+ * thanks to mm_take_all_locks().
+ */
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ mm_drop_all_locks(mm);
+out_clean:
+ if (take_mmap_sem)
+ up_write(&mm->mmap_sem);
+ kfree(mmu_notifier_mm);
+out:
+ BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return ret;
+}
+
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function. Must also ensure mm_users can't go down
+ * to zero while this runs to avoid races with mmu_notifier_release,
+ * so mm has to be current->mm or the mm should be pinned safely such
+ * as with get_task_mm(). If the mm is not current->mm, the mm_users
+ * pin should be released by calling mmput after mmu_notifier_register
+ * returns. mmu_notifier_unregister must be always called to
+ * unregister the notifier. mm_count is automatically pinned to allow
+ * mmu_notifier_unregister to safely run at any time later, before or
+ * after exit_mmap. ->release will always be called before exit_mmap
+ * frees the pages.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 1);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ return do_mmu_notifier_register(mn, mm, 0);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/* this is called after the last mmu_notifier_unregister() returned */
+void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+{
+ BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
+ kfree(mm->mmu_notifier_mm);
+ mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+}
+
+/*
+ * This releases the mm_count pin automatically and frees the mm
+ * structure if it was the last user of it. It serializes against
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
+ * calling mmu_notifier_unregister. ->release or any other notifier
+ * method may be invoked concurrently with mmu_notifier_unregister,
+ * and only after mmu_notifier_unregister returned we're guaranteed
+ * that ->release or any other method can't run anymore.
+ */
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ if (!hlist_unhashed(&mn->hlist)) {
+ /*
+ * SRCU here will force exit_mmap to wait for ->release to
+ * finish before freeing the pages.
+ */
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ /*
+ * exit_mmap will block in mmu_notifier_release to guarantee
+ * that ->release is called before freeing the pages.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ /*
+ * Can not use list_del_rcu() since __mmu_notifier_release
+ * can delete it before we hold the lock.
+ */
+ hlist_del_init_rcu(&mn->hlist);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
+
+ /*
+ * Wait for any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_release instead of us.
+ */
+ synchronize_srcu(&srcu);
+
+ BUG_ON(atomic_read(&mm->mm_count) <= 0);
+
+ mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+ return init_srcu_struct(&srcu);
+}
+subsys_initcall(mmu_notifier_init);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index febea1c9816..bf34fb8556d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,21 +1,19 @@
/*
* linux/mm/mmzone.c
*
- * management codes for pgdats and zones.
+ * management codes for pgdats, zones and page flags
*/
#include <linux/stddef.h>
+#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/module.h>
struct pglist_data *first_online_pgdat(void)
{
return NODE_DATA(first_online_node);
}
-EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */
-
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
int nid = next_online_node(pgdat->node_id);
@@ -24,8 +22,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
-EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */
-
/*
* next_zone - helper magic for for_each_zone()
@@ -45,5 +41,76 @@ struct zone *next_zone(struct zone *zone)
}
return zone;
}
-EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+ return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+ return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+ enum zone_type highest_zoneidx,
+ nodemask_t *nodes,
+ struct zone **zone)
+{
+ /*
+ * Find the next suitable zone to use for the allocation.
+ * Only filter based on nodemask if it's set
+ */
+ if (likely(nodes == NULL))
+ while (zonelist_zone_idx(z) > highest_zoneidx)
+ z++;
+ else
+ while (zonelist_zone_idx(z) > highest_zoneidx ||
+ (z->zone && !zref_in_nodemask(z, nodes)))
+ z++;
+
+ *zone = zonelist_zone(z);
+ return z;
+}
+
+#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
+int memmap_valid_within(unsigned long pfn,
+ struct page *page, struct zone *zone)
+{
+ if (page_to_pfn(page) != pfn)
+ return 0;
+
+ if (page_zone(page) != zone)
+ return 0;
+
+ return 1;
+}
+#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+void lruvec_init(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+
+ memset(lruvec, 0, sizeof(struct lruvec));
+
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&lruvec->lists[lru]);
+}
+
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
+{
+ unsigned long old_flags, flags;
+ int last_cpupid;
+
+ do {
+ old_flags = flags = page->flags;
+ last_cpupid = page_cpupid_last(page);
+
+ flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+ flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+
+ return last_cpupid;
+}
+#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 955f9d0e38a..c43d557941f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -4,13 +4,12 @@
* (C) Copyright 1994 Linus Torvalds
* (C) Copyright 2002 Christoph Hellwig
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
@@ -21,112 +20,243 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
+#include <linux/perf_event.h>
+#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+#ifndef pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ return newprot;
+}
+#endif
+
+/*
+ * For a prot_numa update we only hold mmap_sem for read so there is a
+ * potential race with faulting where a pmd was temporarily none. This
+ * function checks for a transhuge pmd under the appropriate lock. It
+ * returns a pte if it was successfully locked or NULL if it raced with
+ * a transhuge insertion.
+ */
+static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, int prot_numa, spinlock_t **ptl)
+{
+ pte_t *pte;
+ spinlock_t *pmdl;
+
+ /* !prot_numa is protected by mmap_sem held for write */
+ if (!prot_numa)
+ return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+
+ pmdl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
+ spin_unlock(pmdl);
+ return NULL;
+ }
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+ spin_unlock(pmdl);
+ return pte;
+}
+
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
+ struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
+ unsigned long pages = 0;
+
+ pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ if (!pte)
+ return 0;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
+ bool updated = false;
- /* Avoid an SMP race with hardware updated dirty/clean
- * bits by wiping the pte and then setting the new pte
- * into place.
- */
- ptent = ptep_get_and_clear(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
- /*
- * Avoid taking write faults for pages we know to be
- * dirty.
- */
- if (dirty_accountable && pte_dirty(ptent))
- ptent = pte_mkwrite(ptent);
- set_pte_at(mm, addr, pte, ptent);
- lazy_mmu_prot_update(ptent);
-#ifdef CONFIG_MIGRATION
- } else if (!pte_file(oldpte)) {
+ if (!prot_numa) {
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ if (pte_numa(ptent))
+ ptent = pte_mknonnuma(ptent);
+ ptent = pte_modify(ptent, newprot);
+ /*
+ * Avoid taking write faults for pages we
+ * know to be dirty.
+ */
+ if (dirty_accountable && pte_dirty(ptent))
+ ptent = pte_mkwrite(ptent);
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
+ updated = true;
+ } else {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page && !PageKsm(page)) {
+ if (!pte_numa(oldpte)) {
+ ptep_set_numa(mm, addr, pte);
+ updated = true;
+ }
+ }
+ }
+ if (updated)
+ pages++;
+ } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
+ pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
- set_pte_at(mm, addr, pte,
- swp_entry_to_pte(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ set_pte_at(mm, addr, pte, newpte);
+
+ pages++;
}
-#endif
}
-
} while (pte++, addr += PAGE_SIZE, addr != end);
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+
+ return pages;
}
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long next;
+ unsigned long pages = 0;
+ unsigned long nr_huge_updates = 0;
+ unsigned long mni_start = 0;
pmd = pmd_offset(pud, addr);
do {
+ unsigned long this_pages;
+
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!mni_start) {
+ mni_start = addr;
+ mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ }
+
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma, addr, pmd);
+ else {
+ int nr_ptes = change_huge_pmd(vma, pmd, addr,
+ newprot, prot_numa);
+
+ if (nr_ptes) {
+ if (nr_ptes == HPAGE_PMD_NR) {
+ pages += HPAGE_PMD_NR;
+ nr_huge_updates++;
+ }
+
+ /* huge pmd was handled */
+ continue;
+ }
+ }
+ /* fall through, the trans huge pmd just split */
+ }
+ this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa);
+ pages += this_pages;
} while (pmd++, addr = next, addr != end);
+
+ if (mni_start)
+ mmu_notifier_invalidate_range_end(mm, mni_start, end);
+
+ if (nr_huge_updates)
+ count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
+ return pages;
}
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+static inline unsigned long change_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
+ unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
+
+ return pages;
}
-static void change_protection(struct vm_area_struct *vma,
+static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
+ unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ pages += change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
- flush_tlb_range(vma, start, end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+ clear_tlb_flush_pending(mm);
+
+ return pages;
}
-static int
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa)
+{
+ unsigned long pages;
+
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+
+ return pages;
+}
+
+int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
{
@@ -146,15 +276,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
- * make it unwritable again.
- *
- * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
- * a MAP_NORESERVE private mapping to writable will now reserve.
+ * make it unwritable again. hugetlb mappings were accounted for
+ * even if read-only, so there is no need to account for them here.
*/
if (newflags & VM_WRITE) {
- if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
+ VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
- if (security_vm_enough_memory(charged))
+ if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
newflags |= VM_ACCOUNT;
}
@@ -191,20 +320,20 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- vma->vm_page_prot = protection_map[newflags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+ vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
+ vm_get_page_prot(newflags));
+
if (vma_wants_writenotify(vma)) {
- vma->vm_page_prot = protection_map[newflags &
- (VM_READ|VM_WRITE|VM_EXEC)];
+ vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
dirty_accountable = 1;
}
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
- else
- change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ change_protection(vma, start, end, vma->vm_page_prot,
+ dirty_accountable, 0);
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ perf_event_mmap(vma);
return 0;
fail:
@@ -212,8 +341,8 @@ fail:
return error;
}
-asmlinkage long
-sys_mprotect(unsigned long start, size_t len, unsigned long prot)
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+ unsigned long, prot)
{
unsigned long vm_flags, nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
@@ -231,7 +360,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
end = start + len;
if (end <= start)
return -ENOMEM;
- if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
+ if (!arch_validate_prot(prot))
return -EINVAL;
reqprot = prot;
@@ -245,10 +374,11 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
down_write(&current->mm->mmap_sem);
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma(current->mm, start);
error = -ENOMEM;
if (!vma)
goto out;
+ prev = vma->vm_prev;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
@@ -256,8 +386,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
- }
- else {
+ } else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
@@ -273,9 +402,10 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
for (nstart = start ; ; ) {
unsigned long newflags;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+ newflags = vm_flags;
+ newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373a..05f1180e9f2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,26 +3,31 @@
*
* (C) Copyright 1996 Linus Torvalds
*
- * Address space accounting code <alan@redhat.com>
+ * Address space accounting code <alan@lxorguk.ukuu.org.uk>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <linux/slab.h>
#include <linux/shm.h>
+#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched/sysctl.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include "internal.h"
+
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -38,13 +43,14 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none(*pmd))
return NULL;
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -59,34 +65,66 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
if (!pmd)
return NULL;
- if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
- return NULL;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pmd;
}
+static pte_t move_soft_dirty_pte(pte_t pte)
+{
+ /*
+ * Set soft dirty bit so we can notice
+ * in userspace the ptes were moved.
+ */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ if (pte_present(pte))
+ pte = pte_mksoft_dirty(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ else if (pte_file(pte))
+ pte = pte_file_mksoft_dirty(pte);
+#endif
+ return pte;
+}
+
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr)
+ unsigned long new_addr, bool need_rmap_locks)
{
struct address_space *mapping = NULL;
+ struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock vmtruncate out,
- * since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- spin_lock(&mapping->i_mmap_lock);
- if (new_vma->vm_truncate_count &&
- new_vma->vm_truncate_count != vma->vm_truncate_count)
- new_vma->vm_truncate_count = 0;
+ /*
+ * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * locks to ensure that rmap will always observe either the old or the
+ * new ptes. This is the easiest way to avoid races with
+ * truncate_pagecache(), page migration, etc...
+ *
+ * When need_rmap_locks is false, we use other ways to avoid
+ * such races:
+ *
+ * - During exec() shift_arg_pages(), we use a specially tagged vma
+ * which rmap call sites look for using is_vma_temporary_stack().
+ *
+ * - During mremap(), new_vma is often known to be placed after vma
+ * in rmap traversal order. This ensures rmap will always observe
+ * either the old pte, or the new pte, or both (the page table locks
+ * serialize access to individual ptes, but only rmap traversal
+ * order guarantees that we won't miss both the old and new ptes).
+ */
+ if (need_rmap_locks) {
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
+ }
+ if (vma->anon_vma) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock_write(anon_vma);
+ }
}
/*
@@ -94,68 +132,110 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_sem prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_pte = pte_offset_map(new_pmd, new_addr);
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
new_pte++, new_addr += PAGE_SIZE) {
if (pte_none(*old_pte))
continue;
- pte = ptep_clear_flush(vma, old_addr, old_pte);
- /* ZERO_PAGE can be dependant on virtual addr */
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+ pte = move_soft_dirty_pte(pte);
set_pte_at(mm, new_addr, new_pte, pte);
}
+ arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap_nested(new_pte - 1);
+ pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+ if (anon_vma)
+ anon_vma_unlock_write(anon_vma);
if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
-static unsigned long move_page_tables(struct vm_area_struct *vma,
+unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len)
+ unsigned long new_addr, unsigned long len,
+ bool need_rmap_locks)
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
+ bool need_flush = false;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
+ mmun_start = old_addr;
+ mmun_end = old_end;
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
- if (next - 1 > old_end)
- next = old_end;
+ /* even if next overflowed, extent below will be ok */
extent = next - old_addr;
+ if (extent > old_end - old_addr)
+ extent = old_end - old_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
+ if (pmd_trans_huge(*old_pmd)) {
+ int err = 0;
+ if (extent == HPAGE_PMD_SIZE) {
+ VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ anon_vma_lock_write(vma->anon_vma);
+ err = move_huge_pmd(vma, new_vma, old_addr,
+ new_addr, old_end,
+ old_pmd, new_pmd);
+ if (need_rmap_locks)
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ if (err > 0) {
+ need_flush = true;
+ continue;
+ } else if (!err) {
+ split_huge_page_pmd(vma, old_addr, old_pmd);
+ }
+ VM_BUG_ON(pmd_trans_huge(*old_pmd));
+ }
+ if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
+ new_pmd, new_addr))
+ break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr);
+ new_vma, new_pmd, new_addr, need_rmap_locks);
+ need_flush = true;
}
+ if (likely(need_flush))
+ flush_tlb_range(vma, old_end-len, old_addr);
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
return len + old_addr - old_end; /* how much done */
}
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
- unsigned long new_len, unsigned long new_addr)
+ unsigned long new_len, unsigned long new_addr, bool *locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -165,6 +245,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long excess = 0;
unsigned long hiwater_vm;
int split = 0;
+ int err;
+ bool need_rmap_locks;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -173,19 +255,34 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ /*
+ * Advise KSM to break any KSM pages in the area to be moved:
+ * it would be confusing if they were to turn up at the new
+ * location, where they happen to coincide with different KSM
+ * pages recently unmapped. But leave vma->vm_flags as it was,
+ * so KSM can come around to merge on vma and new_vma afterwards.
+ */
+ err = ksm_madvise(vma, old_addr, old_addr + old_len,
+ MADV_UNMERGEABLE, &vm_flags);
+ if (err)
+ return err;
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+ &need_rmap_locks);
if (!new_vma)
return -ENOMEM;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+ need_rmap_locks);
if (moved_len < old_len) {
/*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+ true);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
@@ -211,7 +308,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- mm->total_vm += new_len >> PAGE_SHIFT;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -230,14 +326,145 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
- if (new_len > old_len)
- make_pages_present(new_addr + old_len,
- new_addr + new_len);
+ *locked = true;
}
return new_addr;
}
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (!vma || vma->vm_start > addr)
+ goto Efault;
+
+ if (is_vm_hugetlb_page(vma))
+ goto Einval;
+
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ goto Efault;
+
+ /* Need to be careful about a growing mapping */
+ if (new_len > old_len) {
+ unsigned long pgoff;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
+ goto Efault;
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ goto Einval;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ locked += new_len - old_len;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ goto Eagain;
+ }
+
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ goto Enomem;
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ goto Efault;
+ *p = charged;
+ }
+
+ return vma;
+
+Efault: /* very odd choice for most of the cases, but... */
+ return ERR_PTR(-EFAULT);
+Einval:
+ return ERR_PTR(-EINVAL);
+Enomem:
+ return ERR_PTR(-ENOMEM);
+Eagain:
+ return ERR_PTR(-EAGAIN);
+}
+
+static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
+ unsigned long new_addr, unsigned long new_len, bool *locked)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+ unsigned long map_flags;
+
+ if (new_addr & ~PAGE_MASK)
+ goto out;
+
+ if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ goto out;
+
+ /* Check if the location we're moving into overlaps the
+ * old location at all, and fail if it does.
+ */
+ if ((new_addr <= addr) && (new_addr+new_len) > addr)
+ goto out;
+
+ if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ goto out;
+
+ ret = do_munmap(mm, new_addr, new_len);
+ if (ret)
+ goto out;
+
+ if (old_len >= new_len) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ old_len = new_len;
+ }
+
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ map_flags = MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (ret & ~PAGE_MASK)
+ goto out1;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+ if (!(ret & ~PAGE_MASK))
+ goto out;
+out1:
+ vm_unacct_memory(charged);
+
+out:
+ return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+ unsigned long end = vma->vm_end + delta;
+ if (end < vma->vm_end) /* overflow */
+ return 0;
+ if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ return 0;
+ if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+ 0, MAP_FIXED) & ~PAGE_MASK)
+ return 0;
+ return 1;
+}
+
/*
* Expand (or shrink) an existing mapping, potentially moving it at the
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -245,20 +472,24 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
* This option implies MREMAP_MAYMOVE.
*/
-unsigned long do_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
+ bool locked = false;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
- goto out;
+ return ret;
+
+ if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
+ return ret;
if (addr & ~PAGE_MASK)
- goto out;
+ return ret;
old_len = PAGE_ALIGN(old_len);
new_len = PAGE_ALIGN(new_len);
@@ -269,30 +500,14 @@ unsigned long do_mremap(unsigned long addr,
* a zero new-len is nonsensical.
*/
if (!new_len)
- goto out;
+ return ret;
- /* new_addr is only valid if MREMAP_FIXED is specified */
- if (flags & MREMAP_FIXED) {
- if (new_addr & ~PAGE_MASK)
- goto out;
- if (!(flags & MREMAP_MAYMOVE))
- goto out;
-
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- goto out;
-
- /* Check if the location we're moving into overlaps the
- * old location at all, and fail if it does.
- */
- if ((new_addr <= addr) && (new_addr+new_len) > addr)
- goto out;
-
- if ((addr <= new_addr) && (addr+old_len) > new_addr)
- goto out;
+ down_write(&current->mm->mmap_sem);
- ret = do_munmap(mm, new_addr, new_len);
- if (ret)
- goto out;
+ if (flags & MREMAP_FIXED) {
+ ret = mremap_to(addr, old_len, new_addr, new_len,
+ &locked);
+ goto out;
}
/*
@@ -305,71 +520,36 @@ unsigned long do_mremap(unsigned long addr,
if (ret && old_len != new_len)
goto out;
ret = addr;
- if (!(flags & MREMAP_FIXED) || (new_addr == addr))
- goto out;
- old_len = new_len;
+ goto out;
}
/*
- * Ok, we need to grow.. or relocate.
+ * Ok, we need to grow..
*/
- ret = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (is_vm_hugetlb_page(vma)) {
- ret = -EINVAL;
- goto out;
- }
- /* We can't remap across vm area boundaries */
- if (old_len > vma->vm_end - addr)
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto out;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
- goto out;
- }
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- locked += new_len - old_len;
- ret = -EAGAIN;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto out;
- }
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (vma->vm_flags & VM_ACCOUNT) {
- charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- goto out_nc;
}
/* old_len exactly to the end of the area..
- * And we're not relocating the area.
*/
- if (old_len == vma->vm_end - addr &&
- !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
- (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
- unsigned long max_addr = TASK_SIZE;
- if (vma->vm_next)
- max_addr = vma->vm_next->vm_start;
+ if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
- if (max_addr - addr >= new_len) {
+ if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
- vma_adjust(vma, vma->vm_start,
- addr + new_len, vma->vm_pgoff, NULL);
+ if (vma_adjust(vma, vma->vm_start, addr + new_len,
+ vma->vm_pgoff, NULL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
- mm->total_vm += pages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
- make_pages_present(addr + old_len,
- addr + new_len);
+ locked = true;
+ new_addr = addr;
}
ret = addr;
goto out;
@@ -382,34 +562,26 @@ unsigned long do_mremap(unsigned long addr,
*/
ret = -ENOMEM;
if (flags & MREMAP_MAYMOVE) {
- if (!(flags & MREMAP_FIXED)) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff, map_flags);
+ unsigned long map_flags = 0;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+ vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (new_addr & ~PAGE_MASK) {
ret = new_addr;
- if (new_addr & ~PAGE_MASK)
- goto out;
+ goto out;
}
- ret = move_vma(vma, addr, old_len, new_len, new_addr);
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
-out_nc:
- return ret;
-}
-
-asmlinkage unsigned long sys_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr)
-{
- unsigned long ret;
-
- down_write(&current->mm->mmap_sem);
- ret = do_mremap(addr, old_len, new_len, flags, new_addr);
up_write(&current->mm->mmap_sem);
+ if (locked && new_len > old_len)
+ mm_populate(new_addr + old_len, new_len - old_len);
return ret;
}
diff --git a/mm/msync.c b/mm/msync.c
index 358d73cf7b7..992a1673d48 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,6 +12,7 @@
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/sched.h>
/*
* MS_SYNC syncs the entire file - including mappings.
@@ -27,7 +28,7 @@
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
-asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
+SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
{
unsigned long end;
struct mm_struct *mm = current->mm;
@@ -57,6 +58,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
vma = find_vma(mm, start);
for (;;) {
struct file *file;
+ loff_t fstart, fend;
/* Still start < end. */
error = -ENOMEM;
@@ -76,12 +78,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
goto out_unlock;
}
file = vma->vm_file;
+ fstart = (start - vma->vm_start) +
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ fend = fstart + (min(end, vma->vm_end) - start) - 1;
start = vma->vm_end;
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = do_fsync(file, 0);
+ if (vma->vm_flags & VM_NONLINEAR)
+ error = vfs_fsync(file, 1);
+ else
+ error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 00000000000..7ed58602e71
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,434 @@
+/*
+ * bootmem - A boot-time physical memory allocator and configurator
+ *
+ * Copyright (C) 1999 Ingo Molnar
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/export.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+/*
+ * Find and reserve @size bytes in memblock for node @nid, zero the memory
+ * and return its kernel virtual address.  @goal and @limit are forwarded
+ * to memblock_find_in_range_node(), with @limit clamped to
+ * memblock.current_limit.  Returns NULL if no range is found or the
+ * reservation fails.
+ */
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	u64 phys;
+	void *virt;
+
+	/* clamp the caller's limit to memblock.current_limit */
+	if (memblock.current_limit < limit)
+		limit = memblock.current_limit;
+
+	phys = memblock_find_in_range_node(size, align, goal, limit, nid);
+	if (!phys || memblock_reserve(phys, size))
+		return NULL;
+
+	virt = phys_to_virt(phys);
+	memset(virt, 0, size);
+
+	/*
+	 * A min_count of 0 means bootmem allocated blocks are never
+	 * reported as leaks.
+	 */
+	kmemleak_alloc(virt, size, 0, 0);
+
+	return virt;
+}
+
+/*
+ * free_bootmem_late - hand bootmem pages straight to the page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Meant for the period in which the bootmem allocator has already been
+ * torn down while the system is still initializing.  No bootmem metadata
+ * is updated because it is gone; each page frame between PFN_UP(@addr) and
+ * PFN_DOWN(@addr + @size) is freed individually and counted in
+ * totalram_pages.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+	unsigned long pfn, end_pfn;
+
+	kmemleak_free_part(__va(addr), size);
+
+	pfn = PFN_UP(addr);
+	end_pfn = PFN_DOWN(addr + size);
+
+	while (pfn < end_pfn) {
+		__free_pages_bootmem(pfn_to_page(pfn), 0);
+		totalram_pages++;
+		pfn++;
+	}
+}
+
+/*
+ * Release the page frames in [@start, @end) through __free_pages_bootmem(),
+ * using the largest order that the alignment of the current pfn allows
+ * (capped at MAX_ORDER - 1) and that still fits before @end.
+ */
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	int order;
+
+	while (start < end) {
+		/*
+		 * __ffs() is undefined for 0.  pfn 0 is aligned to every
+		 * order, so begin with the largest one in that case.
+		 */
+		if (start)
+			order = min(MAX_ORDER - 1UL, __ffs(start));
+		else
+			order = MAX_ORDER - 1;
+
+		/* shrink the chunk until it no longer runs past @end */
+		while (start + (1UL << order) > end)
+			order--;
+
+		__free_pages_bootmem(pfn_to_page(start), order);
+
+		start += (1UL << order);
+	}
+}
+
+/*
+ * Free the page frames between PFN_UP(@start) and PFN_DOWN(@end), with the
+ * end capped at max_low_pfn.  Returns the number of page frames passed to
+ * __free_pages_memory(), or 0 when the resulting range is inverted.
+ */
+static unsigned long __init __free_memory_core(phys_addr_t start,
+					phys_addr_t end)
+{
+	unsigned long first_pfn = PFN_UP(start);
+	unsigned long last_pfn = PFN_DOWN(end);
+
+	if (last_pfn > max_low_pfn)
+		last_pfn = max_low_pfn;
+
+	if (first_pfn > last_pfn)
+		return 0;
+
+	__free_pages_memory(first_pfn, last_pfn);
+
+	return last_pfn - first_pfn;
+}
+
+/*
+ * Walk every free memblock range (any node) and release it through
+ * __free_memory_core().  With CONFIG_ARCH_DISCARD_MEMBLOCK the memblock
+ * region arrays themselves are released too, if they were allocated.
+ * Returns the accumulated page count.
+ */
+static unsigned long __init free_low_memory_core_early(void)
+{
+	unsigned long freed = 0;
+	phys_addr_t base, top;
+	u64 idx;
+
+	for_each_free_mem_range(idx, NUMA_NO_NODE, &base, &top, NULL) {
+		freed += __free_memory_core(base, top);
+	}
+
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+	{
+		phys_addr_t len;
+
+		/* Free memblock.reserved array if it was allocated */
+		len = get_allocated_memblock_reserved_regions_info(&base);
+		if (len != 0)
+			freed += __free_memory_core(base, base + len);
+
+		/* Free memblock.memory array if it was allocated */
+		len = get_allocated_memblock_memory_regions_info(&base);
+		if (len != 0)
+			freed += __free_memory_core(base, base + len);
+	}
+#endif
+
+	return freed;
+}
+
+/* set to 1 by reset_all_zones_managed_pages() once it has run */
+static int reset_managed_pages_done __initdata;
+
+/*
+ * Zero the managed_pages counter of every zone in @pgdat, unless the
+ * reset has already been flagged as done.
+ */
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
+{
+	int zid;
+
+	if (reset_managed_pages_done)
+		return;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++)
+		pgdat->node_zones[zid].managed_pages = 0;
+}
+
+/*
+ * Call reset_node_managed_pages() for every online node, then set
+ * reset_managed_pages_done.
+ */
+void __init reset_all_zones_managed_pages(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_managed_pages(pgdat);
+ reset_managed_pages_done = 1;
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * The per-zone managed_pages counters are reset first; the released page
+ * count is then added to totalram_pages.
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+	unsigned long released;
+
+	reset_all_zones_managed_pages();
+
+	/*
+	 * NUMA_NO_NODE has to be used instead of NODE_DATA(0)->node_id:
+	 * in some cases, e.g. when Node0 has no RAM installed, low ram
+	 * will be on Node1.
+	 */
+	released = free_low_memory_core_early();
+	totalram_pages += released;
+
+	return released;
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on (not consulted by this implementation)
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ *
+ * The range is handed to memblock_free() as is.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
+{
+ memblock_free(physaddr, size);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ *
+ * The range is handed to memblock_free() as is.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+ memblock_free(addr, size);
+}
+
+/*
+ * Worker for the non-panicking, node-agnostic allocators.  The request is
+ * tried with @goal first; if that fails and @goal was non-zero it is
+ * retried with the goal dropped to 0.  Returns NULL when nothing could be
+ * allocated.  If the slab allocator is already available, a one-time
+ * warning is emitted and kzalloc() is used instead.
+ */
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+					unsigned long align,
+					unsigned long goal,
+					unsigned long limit)
+{
+	void *ptr;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
+	for (;;) {
+		ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
+						goal, limit);
+		if (ptr || goal == 0)
+			return ptr;
+
+		/* the goal could not be honoured: try again without it */
+		goal = 0;
+	}
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * If @goal can not be satisfied it is dropped and the allocation falls
+ * back to memory below @goal.
+ *
+ * Any node in the system may serve the allocation.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+					unsigned long goal)
+{
+	/* the limit passed down is -1UL */
+	return ___alloc_bootmem_nopanic(size, align, goal, -1UL);
+}
+
+/*
+ * Panicking counterpart of ___alloc_bootmem_nopanic(): a failed request is
+ * logged at KERN_ALERT and followed by panic().
+ */
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+					unsigned long goal, unsigned long limit)
+{
+	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+	if (!mem) {
+		/* Whoops, we cannot satisfy the allocation request. */
+		printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+		panic("Out of memory");
+	}
+
+	return mem;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * If @goal can not be satisfied it is dropped and the allocation falls
+ * back to memory below @goal.
+ *
+ * Any node in the system may serve the allocation.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+			      unsigned long goal)
+{
+	/* the limit passed down is -1UL */
+	return ___alloc_bootmem(size, align, goal, -1UL);
+}
+
+/*
+ * Try to allocate from the node of @pgdat first and from any node
+ * (NUMA_NO_NODE) second.  If both fail and @goal was non-zero, the same
+ * two attempts are repeated with the goal dropped to 0.  Returns NULL if
+ * nothing could be allocated.
+ */
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+						unsigned long size,
+						unsigned long align,
+						unsigned long goal,
+						unsigned long limit)
+{
+	void *ptr;
+
+	for (;;) {
+		ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						goal, limit);
+		if (!ptr)
+			ptr = __alloc_memory_core_early(NUMA_NO_NODE, size,
+							align, goal, limit);
+
+		if (ptr || !goal)
+			return ptr;
+
+		/* neither attempt met the goal: retry without it */
+		goal = 0;
+	}
+}
+
+/*
+ * Node-preferring allocation that does not panic on failure.
+ *
+ * If the slab allocator is already available this warns once and returns
+ * memory from kzalloc_node() on the node of @pgdat.  Otherwise the request
+ * is passed to ___alloc_bootmem_node_nopanic() with a limit of 0.
+ */
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+}
+
+/*
+ * Panicking counterpart of ___alloc_bootmem_node_nopanic(): a failed
+ * request is logged at KERN_ALERT and followed by panic().
+ */
+static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+				    unsigned long align, unsigned long goal,
+				    unsigned long limit)
+{
+	void *mem = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+						   limit);
+
+	if (!mem) {
+		printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+		panic("Out of memory");
+	}
+
+	return mem;
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ *
+ * If the slab allocator is already available, a one-time warning is issued
+ * and the memory is obtained from kzalloc_node() instead.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
+}
+
+/*
+ * In this implementation identical to __alloc_bootmem_node(): all
+ * arguments are forwarded unchanged.
+ */
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+}
+
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * ARCH_LOW_ADDRESS_LIMIT is passed as the allocation limit.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/*
+ * Non-panicking variant of __alloc_bootmem_low(): the request goes to
+ * ___alloc_bootmem_nopanic() with ARCH_LOW_ADDRESS_LIMIT as the limit, and
+ * NULL is returned if it cannot be satisfied.
+ */
+void * __init __alloc_bootmem_low_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem_nopanic(size, align, goal,
+ ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * ARCH_LOW_ADDRESS_LIMIT is passed as the allocation limit.
+ *
+ * The function panics if the request can not be satisfied.
+ *
+ * If the slab allocator is already available, a one-time warning is issued
+ * and the memory is obtained from kzalloc_node() instead.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node(pgdat, size, align, goal,
+ ARCH_LOW_ADDRESS_LIMIT);
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index 56454066219..4a852f6c570 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -6,13 +6,18 @@
*
* See Documentation/nommu-mmap.txt
*
- * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com>
+ * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -20,84 +25,79 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ptrace.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/sched/sysctl.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include "internal.h"
+
+/*
+ * Tracing helpers: kenter() for function entry, kleave() for function exit
+ * and kdebug() for free-form messages.  With "#if 0" they expand to
+ * no_printk(); switch the condition to route them to printk() instead.
+ * NOTE(review): the printk() flavour of kdebug() wraps the message in
+ * "xxx"/"yyy" while the no_printk() flavour does not - confirm intended.
+ */
+#if 0
+#define kenter(FMT, ...) \
+ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) \
+ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+ printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
+#else
+#define kenter(FMT, ...) \
+ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) \
+ no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+ no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
+#endif
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
-unsigned long num_physpages;
-unsigned long askedalloc, realalloc;
-atomic_t vm_committed_space = ATOMIC_INIT(0);
+unsigned long highest_memmap_pfn;
+struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
-EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(__vm_enough_memory);
-
-/* list of shareable VMAs */
-struct rb_root nommu_vma_tree = RB_ROOT;
-DECLARE_RWSEM(nommu_vma_sem);
-
-struct vm_operations_struct generic_file_vm_ops = {
-};
-
-EXPORT_SYMBOL(vfree);
-EXPORT_SYMBOL(vmalloc_to_page);
-EXPORT_SYMBOL(vmalloc_32);
-EXPORT_SYMBOL(vmap);
-EXPORT_SYMBOL(vunmap);
+atomic_long_t mmap_pages_allocated;
/*
- * Handle all mappings that got truncated by a "truncate()"
- * system call.
- *
- * NOTE! We have to be ready to update the memory sharing
- * between the file and the memory map for a potential last
- * incomplete page. Ugly, but necessary.
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
*/
-int vmtruncate(struct inode *inode, loff_t offset)
+unsigned long vm_memory_committed(void)
{
- struct address_space *mapping = inode->i_mapping;
- unsigned long limit;
+ return percpu_counter_read_positive(&vm_committed_as);
+}
- if (inode->i_size < offset)
- goto do_expand;
- i_size_write(inode, offset);
+EXPORT_SYMBOL_GPL(vm_memory_committed);
- truncate_inode_pages(mapping, offset);
- goto out_truncate;
+EXPORT_SYMBOL(mem_map);
-do_expand:
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (limit != RLIM_INFINITY && offset > limit)
- goto out_sig;
- if (offset > inode->i_sb->s_maxbytes)
- goto out;
- i_size_write(inode, offset);
+/* list of mapped, potentially shareable regions */
+static struct kmem_cache *vm_region_jar;
+struct rb_root nommu_region_tree = RB_ROOT;
+DECLARE_RWSEM(nommu_region_sem);
-out_truncate:
- if (inode->i_op && inode->i_op->truncate)
- inode->i_op->truncate(inode);
- return 0;
-out_sig:
- send_sig(SIGXFSZ, current, 0);
-out:
- return -EFBIG;
-}
-
-EXPORT_SYMBOL(vmtruncate);
+const struct vm_operations_struct generic_file_vm_ops = {
+};
/*
* Return the total memory allocated for this pointer, not
@@ -109,46 +109,67 @@ unsigned int kobjsize(const void *objp)
{
struct page *page;
- if (!objp || !((page = virt_to_page(objp))))
+ /*
+ * If the object we have should not have ksize performed on it,
+ * return size of 0
+ */
+ if (!objp || !virt_addr_valid(objp))
return 0;
+ page = virt_to_head_page(objp);
+
+ /*
+ * If the allocator sets PageSlab, we know the pointer came from
+ * kmalloc().
+ */
if (PageSlab(page))
return ksize(objp);
- BUG_ON(page->index < 0);
- BUG_ON(page->index >= MAX_ORDER);
+ /*
+ * If it's not a compound page, see if we have a matching VMA
+ * region. This test is intentionally done in reverse order,
+ * so if there's no VMA, we still fall through and hand back
+ * PAGE_SIZE for 0-order pages.
+ */
+ if (!PageCompound(page)) {
+ struct vm_area_struct *vma;
- return (PAGE_SIZE << page->index);
+ vma = find_vma(current->mm, (unsigned long)objp);
+ if (vma)
+ return vma->vm_end - vma->vm_start;
+ }
+
+ /*
+ * The ksize() function is only guaranteed to work for pointers
+ * returned by kmalloc(). So handle arbitrary pointers here.
+ */
+ return PAGE_SIZE << compound_order(page);
}
-/*
- * get a list of pages in an address range belonging to the specified process
- * and indicate the VMA that covers each page
- * - this is potentially dodgy as we may end incrementing the page count of a
- * slab page or a secondary page from a compound page
- * - don't permit access to VMAs that don't support it, such as I/O mappings
- */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, int write, int force,
- struct page **pages, struct vm_area_struct **vmas)
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int foll_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *nonblocking)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
int i;
/* calculate required read or write permissions.
- * - if 'force' is set, we only require the "MAY" flags.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
*/
- vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+ vm_flags = (foll_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
- for (i = 0; i < len; i++) {
+ for (i = 0; i < nr_pages; i++) {
vma = find_vma(mm, start);
if (!vma)
goto finish_or_fault;
/* protect what we can, including chardevs */
- if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
!(vm_flags & vma->vm_flags))
goto finish_or_fault;
@@ -159,7 +180,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
if (vmas)
vmas[i] = vma;
- start += PAGE_SIZE;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
}
return i;
@@ -168,37 +189,107 @@ finish_or_fault:
return i ? : -EFAULT;
}
+/*
+ * Obtain the pages in an address range of the given process together with
+ * the VMA covering each page.
+ * - @write and @force are translated into FOLL_WRITE and FOLL_FORCE and the
+ *   request is passed on to __get_user_pages()
+ * - potentially dodgy, as the page count of a slab page or of a secondary
+ *   page of a compound page may end up being incremented
+ * - VMAs that don't support access, such as I/O mappings, are refused
+ */
+long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    struct vm_area_struct **vmas)
+{
+	int flags = (write ? FOLL_WRITE : 0) | (force ? FOLL_FORCE : 0);
+
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+				NULL);
+}
EXPORT_SYMBOL(get_user_pages);
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+/**
+ * follow_pfn - look up PFN at a user virtual address
+ * @vma: memory mapping
+ * @address: user virtual address
+ * @pfn: location to store found PFN
+ *
+ * Only mappings flagged VM_IO or VM_PFNMAP are accepted; the PFN stored is
+ * @address shifted right by PAGE_SHIFT.
+ *
+ * Returns zero and the pfn at @pfn on success, -EINVAL otherwise.
+ */
+int follow_pfn(struct vm_area_struct *vma, unsigned long address,
+	       unsigned long *pfn)
+{
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+		*pfn = address >> PAGE_SHIFT;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(follow_pfn);
+
+LIST_HEAD(vmap_area_list);
-void vfree(void *addr)
+void vfree(const void *addr)
{
kfree(addr);
}
+EXPORT_SYMBOL(vfree);
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
/*
- * kmalloc doesn't like __GFP_HIGHMEM for some reason
+ * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
+ * returns only a logical address.
*/
return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
+EXPORT_SYMBOL(__vmalloc);
-struct page * vmalloc_to_page(void *addr)
+/*
+ * Allocate zeroed memory through __vmalloc() and, if find_vma() returns a
+ * VMA of the current process for the resulting address, set VM_USERMAP on
+ * that VMA (under mmap_sem held for writing).  Returns the allocation, or
+ * NULL if __vmalloc() failed.
+ */
+void *vmalloc_user(unsigned long size)
+{
+	struct vm_area_struct *vma;
+	void *mem;
+
+	mem = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL);
+	if (!mem)
+		return mem;
+
+	down_write(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, (unsigned long)mem);
+	if (vma)
+		vma->vm_flags |= VM_USERMAP;
+	up_write(&current->mm->mmap_sem);
+
+	return mem;
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
}
+EXPORT_SYMBOL(vmalloc_to_page);
-unsigned long vmalloc_to_pfn(void *addr)
+unsigned long vmalloc_to_pfn(const void *addr)
{
return page_to_pfn(virt_to_page(addr));
}
-
+EXPORT_SYMBOL(vmalloc_to_pfn);
long vread(char *buf, char *addr, unsigned long count)
{
+ /* Don't allow overflow */
+ if ((unsigned long) buf + count < count)
+ count = -(unsigned long) buf;
+
memcpy(buf, addr, count);
return count;
}
@@ -210,7 +301,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
- return(count);
+ return count;
}
/*
@@ -221,7 +312,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
* Allocate enough pages to cover @size from the page level
* allocator and map them into continguos kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
@@ -230,15 +321,83 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+/*
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);
-/*
- * vmalloc_32 - allocate virtually continguos memory (32bit addressable)
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
*
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ /* @node is not used here: the request is simply passed to vzalloc() */
+ return vzalloc(size);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
+/**
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
+ *
+ * Kernel-internal function to allocate enough pages to cover @size
+ * from the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+
+void *vmalloc_exec(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+}
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
@@ -248,17 +407,98 @@ void *vmalloc_32(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size: allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
+ * remap_vmalloc_range() are permissible.
+ *
+ * Returns whatever vmalloc_user() returns for @size.
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+ /*
+ * We'll have to sort out the ZONE_DMA bits for 64-bit,
+ * but for now this can simply use vmalloc_user() directly.
+ */
+ return vmalloc_user(size);
+}
+EXPORT_SYMBOL(vmalloc_32_user);
void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
BUG();
return NULL;
}
+EXPORT_SYMBOL(vmap);
+
+/* Not implemented in this file: any call ends in BUG(). */
+void vunmap(const void *addr)
+{
+ BUG();
+}
+EXPORT_SYMBOL(vunmap);
+
+/* Not implemented in this file: any call ends in BUG(). */
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+/* Not implemented in this file: any call ends in BUG(). */
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+ BUG();
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+/* Empty stub: there is nothing to do here in this file. */
+void vm_unmap_aliases(void)
+{
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+/*
+ * Implement a stub for vmalloc_sync_all() if the architecture chose not to
+ * have one.  The definition is marked __weak and its body is empty.
+ */
+void __weak vmalloc_sync_all(void)
+{
+}
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @ptes: not used by this implementation
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ *
+ * NOTE: in this file the function is only a stub that calls BUG(); none
+ * of the behaviour described above is carried out here.
+ */
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
-void vunmap(void *addr)
+void free_vm_area(struct vm_struct *area)
{
BUG();
}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+/* Stub: no page is inserted, the call always fails with -EINVAL. */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+ struct page *page)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_insert_page);
/*
* sys_brk() for the most part doesn't need the global kernel
@@ -267,7 +507,7 @@ void vunmap(void *addr)
* to a regular file. in this case, the unmapping will need
* to invoke file system routines that need the global lock.
*/
-asmlinkage unsigned long sys_brk(unsigned long brk)
+SYSCALL_DEFINE1(brk, unsigned long, brk)
{
struct mm_struct *mm = current->mm;
@@ -288,185 +528,382 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
/*
* Ok, looks good - let it rip.
*/
+ flush_icache_range(mm->brk, brk);
return mm->brk = brk;
}
-#ifdef DEBUG
-static void show_process_blocks(void)
+/*
+ * initialise the VMA and region record slabs
+ */
+void __init mmap_init(void)
{
- struct vm_list_struct *vml;
-
- printk("Process blocks %d:", current->pid);
+ int ret;
- for (vml = &current->mm->context.vmlist; vml; vml = vml->next) {
- printk(" %p: %p", vml, vml->vma);
- if (vml->vma)
- printk(" (%d @%lx #%d)",
- kobjsize((void *) vml->vma->vm_start),
- vml->vma->vm_start,
- atomic_read(&vml->vma->vm_usage));
- printk(vml->next ? " ->" : ".\n");
- }
+ ret = percpu_counter_init(&vm_committed_as, 0);
+ VM_BUG_ON(ret);
+ vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}
-#endif /* DEBUG */
/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * - should be called with mm->mmap_sem held writelocked
+ * validate the region tree
+ * - the caller must hold the region lock
*/
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml)
+#ifdef CONFIG_DEBUG_NOMMU_REGIONS
+static noinline void validate_nommu_regions(void)
{
- struct vm_list_struct **ppv;
+ struct vm_region *region, *last;
+ struct rb_node *p, *lastp;
- for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next)
- if ((*ppv)->vma->vm_start > vml->vma->vm_start)
- break;
+ lastp = rb_first(&nommu_region_tree);
+ if (!lastp)
+ return;
- vml->next = *ppv;
- *ppv = vml;
+ last = rb_entry(lastp, struct vm_region, vm_rb);
+ BUG_ON(unlikely(last->vm_end <= last->vm_start));
+ BUG_ON(unlikely(last->vm_top < last->vm_end));
+
+ while ((p = rb_next(lastp))) {
+ region = rb_entry(p, struct vm_region, vm_rb);
+ last = rb_entry(lastp, struct vm_region, vm_rb);
+
+ BUG_ON(unlikely(region->vm_end <= region->vm_start));
+ BUG_ON(unlikely(region->vm_top < region->vm_end));
+ BUG_ON(unlikely(region->vm_start < last->vm_top));
+
+ lastp = p;
+ }
}
+#else
+static void validate_nommu_regions(void)
+{
+}
+#endif
/*
- * look up the first VMA in which addr resides, NULL if none
- * - should be called with mm->mmap_sem at least held readlocked
+ * add a region into the global tree
*/
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static void add_nommu_region(struct vm_region *region)
{
- struct vm_list_struct *loop, *vml;
+ struct vm_region *pregion;
+ struct rb_node **p, *parent;
- /* search the vm_start ordered list */
- vml = NULL;
- for (loop = mm->context.vmlist; loop; loop = loop->next) {
- if (loop->vma->vm_start > addr)
- break;
- vml = loop;
+ validate_nommu_regions();
+
+ parent = NULL;
+ p = &nommu_region_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ pregion = rb_entry(parent, struct vm_region, vm_rb);
+ if (region->vm_start < pregion->vm_start)
+ p = &(*p)->rb_left;
+ else if (region->vm_start > pregion->vm_start)
+ p = &(*p)->rb_right;
+ else if (pregion == region)
+ return;
+ else
+ BUG();
}
- if (vml && vml->vma->vm_end > addr)
- return vml->vma;
+ rb_link_node(&region->vm_rb, parent, p);
+ rb_insert_color(&region->vm_rb, &nommu_region_tree);
- return NULL;
+ validate_nommu_regions();
}
-EXPORT_SYMBOL(find_vma);
/*
- * find a VMA
- * - we don't extend stack VMAs under NOMMU conditions
+ * delete a region from the global tree
*/
-struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+static void delete_nommu_region(struct vm_region *region)
{
- return find_vma(mm, addr);
+ BUG_ON(!nommu_region_tree.rb_node);
+
+ validate_nommu_regions();
+ rb_erase(&region->vm_rb, &nommu_region_tree);
+ validate_nommu_regions();
}
/*
- * look up the first VMA exactly that exactly matches addr
- * - should be called with mm->mmap_sem at least held readlocked
+ * free a contiguous series of pages
*/
-static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
- unsigned long addr)
+static void free_page_series(unsigned long from, unsigned long to)
{
- struct vm_list_struct *vml;
-
- /* search the vm_start ordered list */
- for (vml = mm->context.vmlist; vml; vml = vml->next) {
- if (vml->vma->vm_start == addr)
- return vml->vma;
- if (vml->vma->vm_start > addr)
- break;
+ for (; from < to; from += PAGE_SIZE) {
+ struct page *page = virt_to_page(from);
+
+ kdebug("- free %lx", from);
+ atomic_long_dec(&mmap_pages_allocated);
+ if (page_count(page) != 1)
+ kdebug("free page %p: refcount not one: %d",
+ page, page_count(page));
+ put_page(page);
}
-
- return NULL;
}
/*
- * find a VMA in the global tree
+ * release a reference to a region
+ * - the caller must hold the region semaphore for writing, which this releases
+ * - the region may not have been added to the tree yet, in which case vm_top
+ * will equal vm_start
*/
-static inline struct vm_area_struct *find_nommu_vma(unsigned long start)
+static void __put_nommu_region(struct vm_region *region)
+ __releases(nommu_region_sem)
{
- struct vm_area_struct *vma;
- struct rb_node *n = nommu_vma_tree.rb_node;
+ kenter("%p{%d}", region, region->vm_usage);
- while (n) {
- vma = rb_entry(n, struct vm_area_struct, vm_rb);
+ BUG_ON(!nommu_region_tree.rb_node);
- if (start < vma->vm_start)
- n = n->rb_left;
- else if (start > vma->vm_start)
- n = n->rb_right;
- else
- return vma;
+ if (--region->vm_usage == 0) {
+ if (region->vm_top > region->vm_start)
+ delete_nommu_region(region);
+ up_write(&nommu_region_sem);
+
+ if (region->vm_file)
+ fput(region->vm_file);
+
+ /* IO memory and memory shared directly out of the pagecache
+ * from ramfs/tmpfs mustn't be released here */
+ if (region->vm_flags & VM_MAPPED_COPY) {
+ kdebug("free series");
+ free_page_series(region->vm_start, region->vm_top);
+ }
+ kmem_cache_free(vm_region_jar, region);
+ } else {
+ up_write(&nommu_region_sem);
}
+}
- return NULL;
+/*
+ * release a reference to a region
+ */
+static void put_nommu_region(struct vm_region *region)
+{
+ down_write(&nommu_region_sem);
+ __put_nommu_region(region);
}
/*
- * add a VMA in the global tree
+ * update protection on a vma
+ */
+static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
+{
+#ifdef CONFIG_MPU
+	struct mm_struct *mm = vma->vm_mm;
+	long addr;
+
+	/* call protect_page() with @flags for each page up to vm_end */
+	for (addr = vma->vm_start & PAGE_MASK; addr < vma->vm_end;
+	     addr += PAGE_SIZE)
+		protect_page(mm, addr, flags);
+
+	update_protections(mm);
+#endif
+}
+
+/*
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_sem held writelocked
*/
-static void add_nommu_vma(struct vm_area_struct *vma)
+static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma;
+ struct vm_area_struct *pvma, *prev;
struct address_space *mapping;
- struct rb_node **p = &nommu_vma_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p, *parent, *rb_prev;
+
+ kenter(",%p", vma);
+
+ BUG_ON(!vma->vm_region);
+
+ mm->map_count++;
+ vma->vm_mm = mm;
+
+ protect_vma(vma, vma->vm_flags);
/* add the VMA to the mapping */
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
- /* add the VMA to the master list */
+ /* add the VMA to the tree */
+ parent = rb_prev = NULL;
+ p = &mm->mm_rb.rb_node;
while (*p) {
parent = *p;
pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
- if (vma->vm_start < pvma->vm_start) {
+ /* sort by: start addr, end addr, VMA struct addr in that order
+ * (the latter is necessary as we may get identical VMAs) */
+ if (vma->vm_start < pvma->vm_start)
p = &(*p)->rb_left;
- }
else if (vma->vm_start > pvma->vm_start) {
+ rb_prev = parent;
p = &(*p)->rb_right;
- }
- else {
- /* mappings are at the same address - this can only
- * happen for shared-mem chardevs and shared file
- * mappings backed by ramfs/tmpfs */
- BUG_ON(!(pvma->vm_flags & VM_SHARED));
-
- if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma)
- p = &(*p)->rb_right;
- else
- BUG();
- }
+ } else if (vma->vm_end < pvma->vm_end)
+ p = &(*p)->rb_left;
+ else if (vma->vm_end > pvma->vm_end) {
+ rb_prev = parent;
+ p = &(*p)->rb_right;
+ } else if (vma < pvma)
+ p = &(*p)->rb_left;
+ else if (vma > pvma) {
+ rb_prev = parent;
+ p = &(*p)->rb_right;
+ } else
+ BUG();
}
rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
+ rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+
+ /* add VMA to the VMA list also */
+ prev = NULL;
+ if (rb_prev)
+ prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+
+ __vma_link_list(mm, vma, prev, parent);
}
/*
- * delete a VMA from the global list
+ * delete a VMA from its owning mm_struct and address space
*/
-static void delete_nommu_vma(struct vm_area_struct *vma)
+static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
+ struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
+
+ kenter("%p", vma);
+
+ protect_vma(vma, 0);
+
+ mm->map_count--;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
+ mutex_unlock(&mapping->i_mmap_mutex);
+ }
+
+ /* remove from the MM's tree and list */
+ rb_erase(&vma->vm_rb, &mm->mm_rb);
+
+ if (vma->vm_prev)
+ vma->vm_prev->vm_next = vma->vm_next;
+ else
+ mm->mmap = vma->vm_next;
+
+ if (vma->vm_next)
+ vma->vm_next->vm_prev = vma->vm_prev;
+}
+
+/*
+ * destroy a VMA record
+ */
+static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ kenter("%p", vma);
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ put_nommu_region(vma->vm_region);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+/*
+ * look up the first VMA in which addr resides, NULL if none
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ /* check the cache first */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+ * resides) */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end > addr) {
+ vmacache_update(addr, vma);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(find_vma);
+
+/*
+ * find a VMA
+ * - we don't extend stack VMAs under NOMMU conditions
+ */
+struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+/*
+ * expand a stack to a given address
+ * - not supported under NOMMU conditions
+ */
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+ return -ENOMEM;
+}
+
+/*
+ * look up the first VMA that exactly matches addr
+ * - should be called with mm->mmap_sem at least held readlocked
+ */
+static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len)
+{
+ struct vm_area_struct *vma;
+ unsigned long end = addr + len;
+
+ /* check the cache first */
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
+ return vma;
+
+ /* trawl the list (there may be multiple mappings in which addr
+ * resides) */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_start < addr)
+ continue;
+ if (vma->vm_start > addr)
+ return NULL;
+ if (vma->vm_end == end) {
+ vmacache_update(addr, vma);
+ return vma;
+ }
}
- /* remove from the master list */
- rb_erase(&vma->vm_rb, &nommu_vma_tree);
+ return NULL;
}
/*
@@ -481,12 +918,11 @@ static int validate_mmap_request(struct file *file,
unsigned long pgoff,
unsigned long *_capabilities)
{
- unsigned long capabilities;
- unsigned long reqprot = prot;
+ unsigned long capabilities, rlen;
int ret;
/* do the simple checks first */
- if (flags & MAP_FIXED || addr) {
+ if (flags & MAP_FIXED) {
printk(KERN_DEBUG
"%d: Can't do fixed-address/overlay mmap of RAM\n",
current->pid);
@@ -497,22 +933,24 @@ static int validate_mmap_request(struct file *file,
(flags & MAP_TYPE) != MAP_SHARED)
return -EINVAL;
- if (PAGE_ALIGN(len) == 0)
- return addr;
-
- if (len > TASK_SIZE)
+ if (!len)
return -EINVAL;
+ /* Careful about overflows.. */
+ rlen = PAGE_ALIGN(len);
+ if (!rlen || rlen > TASK_SIZE)
+ return -ENOMEM;
+
/* offset overflow? */
- if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EINVAL;
+ if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
+ return -EOVERFLOW;
if (file) {
/* validate file mapping requests */
struct address_space *mapping;
/* files must support mmap */
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -521,7 +959,7 @@ static int validate_mmap_request(struct file *file,
*/
mapping = file->f_mapping;
if (!mapping)
- mapping = file->f_dentry->d_inode->i_mapping;
+ mapping = file_inode(file)->i_mapping;
capabilities = 0;
if (mapping && mapping->backing_dev_info)
@@ -530,7 +968,7 @@ static int validate_mmap_request(struct file *file,
if (!capabilities) {
/* no explicit capabilities set, so assume some
* defaults */
- switch (file->f_dentry->d_inode->i_mode & S_IFMT) {
+ switch (file_inode(file)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
capabilities = BDI_CAP_MAP_COPY;
@@ -555,34 +993,29 @@ static int validate_mmap_request(struct file *file,
if (!file->f_op->read)
capabilities &= ~BDI_CAP_MAP_COPY;
+ /* The file shall have been opened with read permission. */
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+
if (flags & MAP_SHARED) {
/* do checks for writing, appending and locking */
if ((prot & PROT_WRITE) &&
!(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (IS_APPEND(file->f_dentry->d_inode) &&
+ if (IS_APPEND(file_inode(file)) &&
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file->f_dentry->d_inode))
+ if (locks_verify_locked(file))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
return -ENODEV;
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
- ) {
- printk("MAP_SHARED not completely supported on !MMU\n");
- return -EINVAL;
- }
-
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
- }
- else {
+ } else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -594,28 +1027,39 @@ static int validate_mmap_request(struct file *file,
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
+ ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
+ ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ ) {
+ capabilities &= ~BDI_CAP_MAP_DIRECT;
+ if (flags & MAP_SHARED) {
+ printk(KERN_WARNING
+ "MAP_SHARED not completely supported on !MMU\n");
+ return -EINVAL;
+ }
+ }
+ }
+
/* handle executable mappings and implied executable
* mappings */
- if (file->f_vfsmnt->mnt_flags & MNT_NOEXEC) {
+ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
- }
- else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
- }
- else if ((prot & PROT_READ) &&
+ } else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
- }
- else {
+ } else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
@@ -628,7 +1072,7 @@ static int validate_mmap_request(struct file *file,
}
/* allow the security API to have its say */
- ret = security_file_mmap(file, reqprot, prot, flags);
+ ret = security_mmap_addr(addr);
if (ret < 0)
return ret;
@@ -649,94 +1093,129 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
- vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* vm_flags |= mm->def_flags; */
if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
vm_flags |= VM_MAYSHARE;
- }
- else {
+ } else {
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
+ vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
if (flags & MAP_SHARED)
- vm_flags |= VM_MAYSHARE | VM_SHARED;
- else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
- vm_flags |= VM_MAYSHARE;
+ vm_flags |= VM_SHARED;
}
/* refuse to let anyone share private mappings with this process if
* it's being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
- if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+ if ((flags & MAP_PRIVATE) && current->ptrace)
vm_flags &= ~VM_MAYSHARE;
return vm_flags;
}
/*
- * set up a shared mapping on a file
+ * set up a shared mapping on a file (the driver or filesystem provides and
+ * pins the storage)
*/
-static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
+static int do_mmap_shared_file(struct vm_area_struct *vma)
{
int ret;
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
+ if (ret == 0) {
+ vma->vm_region->vm_top = vma->vm_region->vm_end;
+ return 0;
+ }
if (ret != -ENOSYS)
return ret;
- /* getting an ENOSYS error indicates that direct mmap isn't
- * possible (as opposed to tried but failed) so we'll fall
- * through to making a private copy of the data and mapping
- * that if we can */
+ /* getting -ENOSYS indicates that direct mmap isn't possible (as
+ * opposed to tried but failed) so we can only give a suitable error as
+ * it's not possible to make a private copy if MAP_SHARED was given */
return -ENODEV;
}
/*
* set up a private mapping or an anonymous shared mapping
*/
-static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
+static int do_mmap_private(struct vm_area_struct *vma,
+ struct vm_region *region,
+ unsigned long len,
+ unsigned long capabilities)
{
+ struct page *pages;
+ unsigned long total, point, n;
void *base;
- int ret;
+ int ret, order;
/* invoke the file's mapping function so that it can keep track of
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (vma->vm_file) {
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
- if (ret != -ENOSYS) {
+ if (ret == 0) {
/* shouldn't return success if we're not sharing */
- BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE));
- return ret; /* success or a real error */
+ BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
+ vma->vm_region->vm_top = vma->vm_region->vm_end;
+ return 0;
}
+ if (ret != -ENOSYS)
+ return ret;
/* getting an ENOSYS error indicates that direct mmap isn't
* possible (as opposed to tried but failed) so we'll try to
* make a private copy of the data and map that instead */
}
+
/* allocate some memory to hold the mapping
* - note that this may not return a page-aligned address if the object
* we're allocating is smaller than a page
*/
- base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
- if (!base)
+ order = get_order(len);
+ kdebug("alloc order %d for %lx", order, len);
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ if (!pages)
goto enomem;
- vma->vm_start = (unsigned long) base;
- vma->vm_end = vma->vm_start + len;
- vma->vm_flags |= VM_MAPPED_COPY;
+ total = 1 << order;
+ atomic_long_add(total, &mmap_pages_allocated);
+
+ point = len >> PAGE_SHIFT;
+
+ /* we allocated a power-of-2 sized page set, so we may want to trim off
+ * the excess */
+ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+ while (total > point) {
+ order = ilog2(total - point);
+ n = 1 << order;
+ kdebug("shave %lu/%lu @%lu", n, total - point, total);
+ atomic_long_sub(n, &mmap_pages_allocated);
+ total -= n;
+ set_page_refcounted(pages + total);
+ __free_pages(pages + total, order);
+ }
+ }
-#ifdef WARN_ON_SLACK
- if (len + WARN_ON_SLACK <= kobjsize(result))
- printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n",
- len, current->pid, kobjsize(result) - len);
-#endif
+ for (point = 1; point < total; point++)
+ set_page_refcounted(&pages[point]);
+
+ base = page_address(pages);
+ region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
+ region->vm_start = (unsigned long) base;
+ region->vm_end = region->vm_start + len;
+ region->vm_top = region->vm_start + (total << PAGE_SHIFT);
+
+ vma->vm_start = region->vm_start;
+ vma->vm_end = region->vm_start + len;
if (vma->vm_file) {
/* read the contents of a file into the copy */
@@ -758,22 +1237,21 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
if (ret < len)
memset(base + ret, 0, len - ret);
- } else {
- /* if it's an anonymous mapping, then just clear it */
- memset(base, 0, len);
}
return 0;
error_free:
- kfree(base);
- vma->vm_start = 0;
+ free_page_series(region->vm_start, region->vm_top);
+ region->vm_start = vma->vm_start = 0;
+ region->vm_end = vma->vm_end = 0;
+ region->vm_top = 0;
return ret;
enomem:
- printk("Allocation of length %lu from process %d failed\n",
- len, current->pid);
- show_free_areas();
+ pr_err("Allocation of length %lu from process %d (%s) failed\n",
+ len, current->pid, current->comm);
+ show_free_areas(0);
return -ENOMEM;
}
@@ -785,316 +1263,511 @@ unsigned long do_mmap_pgoff(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
- unsigned long pgoff)
+ unsigned long pgoff,
+ unsigned long *populate)
{
- struct vm_list_struct *vml = NULL;
- struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *vma;
+ struct vm_region *region;
struct rb_node *rb;
- unsigned long capabilities, vm_flags;
- void *result;
+ unsigned long capabilities, vm_flags, result;
int ret;
+ kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
+
+ *populate = 0;
+
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities);
- if (ret < 0)
+ if (ret < 0) {
+ kleave(" = %d [val]", ret);
return ret;
+ }
+
+ /* we ignore the address hint */
+ addr = 0;
+ len = PAGE_ALIGN(len);
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
- /* we're going to need to record the mapping if it works */
- vml = kmalloc(sizeof(struct vm_list_struct), GFP_KERNEL);
- if (!vml)
- goto error_getting_vml;
- memset(vml, 0, sizeof(*vml));
+ /* we're going to need to record the mapping */
+ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
+ if (!region)
+ goto error_getting_region;
+
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ goto error_getting_vma;
- down_write(&nommu_vma_sem);
+ region->vm_usage = 1;
+ region->vm_flags = vm_flags;
+ region->vm_pgoff = pgoff;
+
+ INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma->vm_flags = vm_flags;
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ region->vm_file = get_file(file);
+ vma->vm_file = get_file(file);
+ }
- /* if we want to share, we need to check for VMAs created by other
+ down_write(&nommu_region_sem);
+
+ /* if we want to share, we need to check for regions created by other
* mmap() calls that overlap with our proposed mapping
- * - we can only share with an exact match on most regular files
+ * - we can only share with a superset match on most regular files
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) {
- unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long vmpglen;
+ struct vm_region *pregion;
+ unsigned long pglen, rpglen, pgend, rpgend, start;
+
+ pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgend = pgoff + pglen;
- for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
- vma = rb_entry(rb, struct vm_area_struct, vm_rb);
+ for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
+ pregion = rb_entry(rb, struct vm_region, vm_rb);
- if (!(vma->vm_flags & VM_MAYSHARE))
+ if (!(pregion->vm_flags & VM_MAYSHARE))
continue;
/* search for overlapping mappings on the same file */
- if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
+ if (file_inode(pregion->vm_file) !=
+ file_inode(file))
continue;
- if (vma->vm_pgoff >= pgoff + pglen)
+ if (pregion->vm_pgoff >= pgend)
continue;
- vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1;
- vmpglen >>= PAGE_SHIFT;
- if (pgoff >= vma->vm_pgoff + vmpglen)
+ rpglen = pregion->vm_end - pregion->vm_start;
+ rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ rpgend = pregion->vm_pgoff + rpglen;
+ if (pgoff >= rpgend)
continue;
- /* handle inexactly overlapping matches between mappings */
- if (vma->vm_pgoff != pgoff || vmpglen != pglen) {
+ /* handle inexactly overlapping matches between
+ * mappings */
+ if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
+ !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
+ /* new mapping is not a subset of the region */
if (!(capabilities & BDI_CAP_MAP_DIRECT))
goto sharing_violation;
continue;
}
- /* we've found a VMA we can share */
- atomic_inc(&vma->vm_usage);
-
- vml->vma = vma;
- result = (void *) vma->vm_start;
- goto shared;
+ /* we've found a region we can share */
+ pregion->vm_usage++;
+ vma->vm_region = pregion;
+ start = pregion->vm_start;
+ start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
+ vma->vm_start = start;
+ vma->vm_end = start + len;
+
+ if (pregion->vm_flags & VM_MAPPED_COPY) {
+ kdebug("share copy");
+ vma->vm_flags |= VM_MAPPED_COPY;
+ } else {
+ kdebug("share mmap");
+ ret = do_mmap_shared_file(vma);
+ if (ret < 0) {
+ vma->vm_region = NULL;
+ vma->vm_start = 0;
+ vma->vm_end = 0;
+ pregion->vm_usage--;
+ pregion = NULL;
+ goto error_just_free;
+ }
+ }
+ fput(region->vm_file);
+ kmem_cache_free(vm_region_jar, region);
+ region = pregion;
+ result = start;
+ goto share;
}
- vma = NULL;
-
/* obtain the address at which to make a shared mapping
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (file && file->f_op->get_unmapped_area) {
+ if (capabilities & BDI_CAP_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
- if (IS_ERR((void *) addr)) {
+ if (IS_ERR_VALUE(addr)) {
ret = addr;
- if (ret != (unsigned long) -ENOSYS)
- goto error;
+ if (ret != -ENOSYS)
+ goto error_just_free;
/* the driver refused to tell us where to site
* the mapping so we'll have to attempt to copy
* it */
- ret = (unsigned long) -ENODEV;
+ ret = -ENODEV;
if (!(capabilities & BDI_CAP_MAP_COPY))
- goto error;
+ goto error_just_free;
capabilities &= ~BDI_CAP_MAP_DIRECT;
+ } else {
+ vma->vm_start = region->vm_start = addr;
+ vma->vm_end = region->vm_end = addr + len;
}
}
}
- /* we're going to need a VMA struct as well */
- vma = kmalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
- if (!vma)
- goto error_getting_vma;
-
- memset(vma, 0, sizeof(*vma));
- INIT_LIST_HEAD(&vma->anon_vma_node);
- atomic_set(&vma->vm_usage, 1);
- if (file)
- get_file(file);
- vma->vm_file = file;
- vma->vm_flags = vm_flags;
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
-
- vml->vma = vma;
+ vma->vm_region = region;
- /* set up the mapping */
+ /* set up the mapping
+ * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ */
if (file && vma->vm_flags & VM_SHARED)
- ret = do_mmap_shared_file(vma, len);
+ ret = do_mmap_shared_file(vma);
else
- ret = do_mmap_private(vma, len);
+ ret = do_mmap_private(vma, region, len, capabilities);
if (ret < 0)
- goto error;
+ goto error_just_free;
+ add_nommu_region(region);
- /* okay... we have a mapping; now we have to register it */
- result = (void *) vma->vm_start;
+ /* clear anonymous mappings that don't ask for uninitialized data */
+ if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ memset((void *)region->vm_start, 0,
+ region->vm_end - region->vm_start);
- if (vma->vm_flags & VM_MAPPED_COPY) {
- realalloc += kobjsize(result);
- askedalloc += len;
- }
-
- realalloc += kobjsize(vma);
- askedalloc += sizeof(*vma);
+ /* okay... we have a mapping; now we have to register it */
+ result = vma->vm_start;
current->mm->total_vm += len >> PAGE_SHIFT;
- add_nommu_vma(vma);
-
- shared:
- realalloc += kobjsize(vml);
- askedalloc += sizeof(*vml);
+share:
+ add_vma_to_mm(current->mm, vma);
- add_vma_to_mm(current->mm, vml);
-
- up_write(&nommu_vma_sem);
+ /* we flush the region from the icache only when the first executable
+ * mapping of it is made */
+ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
+ flush_icache_range(region->vm_start, region->vm_end);
+ region->vm_icache_flushed = true;
+ }
- if (prot & PROT_EXEC)
- flush_icache_range((unsigned long) result,
- (unsigned long) result + len);
+ up_write(&nommu_region_sem);
-#ifdef DEBUG
- printk("do_mmap:\n");
- show_process_blocks();
-#endif
+ kleave(" = %lx", result);
+ return result;
- return (unsigned long) result;
-
- error:
- up_write(&nommu_vma_sem);
- kfree(vml);
- if (vma) {
+error_just_free:
+ up_write(&nommu_region_sem);
+error:
+ if (region->vm_file)
+ fput(region->vm_file);
+ kmem_cache_free(vm_region_jar, region);
+ if (vma->vm_file)
fput(vma->vm_file);
- kfree(vma);
- }
+ kmem_cache_free(vm_area_cachep, vma);
+ kleave(" = %d", ret);
return ret;
- sharing_violation:
- up_write(&nommu_vma_sem);
- printk("Attempt to share mismatched mappings\n");
- kfree(vml);
- return -EINVAL;
+sharing_violation:
+ up_write(&nommu_region_sem);
+ printk(KERN_WARNING "Attempt to share mismatched mappings\n");
+ ret = -EINVAL;
+ goto error;
- error_getting_vma:
- up_write(&nommu_vma_sem);
- kfree(vml);
- printk("Allocation of vma for %lu byte allocation from process %d failed\n",
+error_getting_vma:
+ kmem_cache_free(vm_region_jar, region);
+ printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
+ " from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
- error_getting_vml:
- printk("Allocation of vml for %lu byte allocation from process %d failed\n",
+error_getting_region:
+ printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
+ " from process %d failed\n",
len, current->pid);
- show_free_areas();
+ show_free_areas(0);
return -ENOMEM;
}
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ struct file *file = NULL;
+ unsigned long retval = -EBADF;
+
+ audit_mmap_fd(fd, flags);
+ if (!(flags & MAP_ANONYMOUS)) {
+ file = fget(fd);
+ if (!file)
+ goto out;
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long prot;
+ unsigned long flags;
+ unsigned long fd;
+ unsigned long offset;
+};
+
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+ struct mmap_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
/*
- * handle mapping disposal for uClinux
+ * split a vma into two pieces at address 'addr', a new vma is allocated either
+ * for the first part or the tail.
*/
-static void put_vma(struct vm_area_struct *vma)
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
{
- if (vma) {
- down_write(&nommu_vma_sem);
+ struct vm_area_struct *new;
+ struct vm_region *region;
+ unsigned long npages;
- if (atomic_dec_and_test(&vma->vm_usage)) {
- delete_nommu_vma(vma);
+ kenter("");
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ /* we're only permitted to split anonymous regions (these should have
+ * only a single usage on the region) */
+ if (vma->vm_file)
+ return -ENOMEM;
- /* IO memory and memory shared directly out of the pagecache from
- * ramfs/tmpfs mustn't be released here */
- if (vma->vm_flags & VM_MAPPED_COPY) {
- realalloc -= kobjsize((void *) vma->vm_start);
- askedalloc -= vma->vm_end - vma->vm_start;
- kfree((void *) vma->vm_start);
- }
+ if (mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
- realalloc -= kobjsize(vma);
- askedalloc -= sizeof(*vma);
+ region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
- if (vma->vm_file)
- fput(vma->vm_file);
- kfree(vma);
- }
+ new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!new) {
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
+ }
+
+ /* most fields are the same, copy all, and then fixup */
+ *new = *vma;
+ *region = *vma->vm_region;
+ new->vm_region = region;
+
+ npages = (addr - vma->vm_start) >> PAGE_SHIFT;
- up_write(&nommu_vma_sem);
+ if (new_below) {
+ region->vm_top = region->vm_end = new->vm_end = addr;
+ } else {
+ region->vm_start = new->vm_start = addr;
+ region->vm_pgoff = new->vm_pgoff += npages;
}
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+ delete_vma_from_mm(vma);
+ down_write(&nommu_region_sem);
+ delete_nommu_region(vma->vm_region);
+ if (new_below) {
+ vma->vm_region->vm_start = vma->vm_start = addr;
+ vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
+ } else {
+ vma->vm_region->vm_end = vma->vm_end = addr;
+ vma->vm_region->vm_top = addr;
+ }
+ add_nommu_region(vma->vm_region);
+ add_nommu_region(new->vm_region);
+ up_write(&nommu_region_sem);
+ add_vma_to_mm(mm, vma);
+ add_vma_to_mm(mm, new);
+ return 0;
}
/*
- * release a mapping
- * - under NOMMU conditions the parameters must match exactly to the mapping to
- * be removed
+ * shrink a VMA by removing the specified chunk from either the beginning or
+ * the end
*/
-int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
+static int shrink_vma(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long from, unsigned long to)
{
- struct vm_list_struct *vml, **parent;
- unsigned long end = addr + len;
+ struct vm_region *region;
-#ifdef DEBUG
- printk("do_munmap:\n");
-#endif
+ kenter("");
- for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) {
- if ((*parent)->vma->vm_start > addr)
- break;
- if ((*parent)->vma->vm_start == addr &&
- ((len == 0) || ((*parent)->vma->vm_end == end)))
- goto found;
+ /* adjust the VMA's pointers, which may reposition it in the MM's tree
+ * and list */
+ delete_vma_from_mm(vma);
+ if (from > vma->vm_start)
+ vma->vm_end = from;
+ else
+ vma->vm_start = to;
+ add_vma_to_mm(mm, vma);
+
+ /* cut the backing region down to size */
+ region = vma->vm_region;
+ BUG_ON(region->vm_usage != 1);
+
+ down_write(&nommu_region_sem);
+ delete_nommu_region(region);
+ if (from > region->vm_start) {
+ to = region->vm_top;
+ region->vm_top = region->vm_end = from;
+ } else {
+ region->vm_start = to;
}
+ add_nommu_region(region);
+ up_write(&nommu_region_sem);
- printk("munmap of non-mmaped memory by process %d (%s): %p\n",
- current->pid, current->comm, (void *) addr);
- return -EINVAL;
+ free_page_series(from, to);
+ return 0;
+}
- found:
- vml = *parent;
+/*
+ * release a mapping
+ * - under NOMMU conditions the chunk to be unmapped must be backed by a single
+ * VMA, though it need not cover the whole VMA
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+ struct vm_area_struct *vma;
+ unsigned long end;
+ int ret;
- put_vma(vml->vma);
+ kenter(",%lx,%zx", start, len);
- *parent = vml->next;
- realalloc -= kobjsize(vml);
- askedalloc -= sizeof(*vml);
- kfree(vml);
+ len = PAGE_ALIGN(len);
+ if (len == 0)
+ return -EINVAL;
- update_hiwater_vm(mm);
- mm->total_vm -= len >> PAGE_SHIFT;
+ end = start + len;
+
+ /* find the first potentially overlapping VMA */
+ vma = find_vma(mm, start);
+ if (!vma) {
+ static int limit;
+ if (limit < 5) {
+ printk(KERN_WARNING
+ "munmap of memory not mmapped by process %d"
+ " (%s): 0x%lx-0x%lx\n",
+ current->pid, current->comm,
+ start, start + len - 1);
+ limit++;
+ }
+ return -EINVAL;
+ }
-#ifdef DEBUG
- show_process_blocks();
-#endif
+ /* we're allowed to split an anonymous VMA but not a file-backed one */
+ if (vma->vm_file) {
+ do {
+ if (start > vma->vm_start) {
+ kleave(" = -EINVAL [miss]");
+ return -EINVAL;
+ }
+ if (end == vma->vm_end)
+ goto erase_whole_vma;
+ vma = vma->vm_next;
+ } while (vma);
+ kleave(" = -EINVAL [split file]");
+ return -EINVAL;
+ } else {
+ /* the chunk must be a subset of the VMA found */
+ if (start == vma->vm_start && end == vma->vm_end)
+ goto erase_whole_vma;
+ if (start < vma->vm_start || end > vma->vm_end) {
+ kleave(" = -EINVAL [superset]");
+ return -EINVAL;
+ }
+ if (start & ~PAGE_MASK) {
+ kleave(" = -EINVAL [unaligned start]");
+ return -EINVAL;
+ }
+ if (end != vma->vm_end && end & ~PAGE_MASK) {
+ kleave(" = -EINVAL [unaligned split]");
+ return -EINVAL;
+ }
+ if (start != vma->vm_start && end != vma->vm_end) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret < 0) {
+ kleave(" = %d [split]", ret);
+ return ret;
+ }
+ }
+ return shrink_vma(mm, vma, start, end);
+ }
+erase_whole_vma:
+ delete_vma_from_mm(vma);
+ delete_vma(mm, vma);
+ kleave(" = 0");
return 0;
}
+EXPORT_SYMBOL(do_munmap);
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
+int vm_munmap(unsigned long addr, size_t len)
{
- int ret;
struct mm_struct *mm = current->mm;
+ int ret;
down_write(&mm->mmap_sem);
ret = do_munmap(mm, addr, len);
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL(vm_munmap);
+
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+ return vm_munmap(addr, len);
+}
/*
- * Release all mappings
+ * release all the mappings made in a process's VM space
*/
-void exit_mmap(struct mm_struct * mm)
+void exit_mmap(struct mm_struct *mm)
{
- struct vm_list_struct *tmp;
+ struct vm_area_struct *vma;
- if (mm) {
-#ifdef DEBUG
- printk("Exit_mmap:\n");
-#endif
+ if (!mm)
+ return;
- mm->total_vm = 0;
+ kenter("");
- while ((tmp = mm->context.vmlist)) {
- mm->context.vmlist = tmp->next;
- put_vma(tmp->vma);
+ mm->total_vm = 0;
- realalloc -= kobjsize(tmp);
- askedalloc -= sizeof(*tmp);
- kfree(tmp);
- }
-
-#ifdef DEBUG
- show_process_blocks();
-#endif
+ while ((vma = mm->mmap)) {
+ mm->mmap = vma->vm_next;
+ delete_vma_from_mm(vma);
+ delete_vma(mm, vma);
+ cond_resched();
}
+
+ kleave("");
}
-unsigned long do_brk(unsigned long addr, unsigned long len)
+unsigned long vm_brk(unsigned long addr, unsigned long len)
{
return -ENOMEM;
}
@@ -1104,25 +1777,30 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
* time (controlled by the MREMAP_MAYMOVE flag and available VM space)
*
* under NOMMU conditions, we only permit changing a mapping's size, and only
- * as long as it stays within the hole allocated by the kmalloc() call in
- * do_mmap_pgoff() and the block is not shareable
+ * as long as it stays within the region allocated by do_mmap_private() and the
+ * block is not shareable
*
* MREMAP_FIXED is not supported under NOMMU conditions
*/
-unsigned long do_mremap(unsigned long addr,
+static unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr)
{
struct vm_area_struct *vma;
/* insanity checks first */
- if (new_len == 0)
+ old_len = PAGE_ALIGN(old_len);
+ new_len = PAGE_ALIGN(new_len);
+ if (old_len == 0 || new_len == 0)
return (unsigned long) -EINVAL;
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+
if (flags & MREMAP_FIXED && new_addr != addr)
return (unsigned long) -EINVAL;
- vma = find_vma_exact(current->mm, addr);
+ vma = find_vma_exact(current->mm, addr, old_len);
if (!vma)
return (unsigned long) -EINVAL;
@@ -1132,21 +1810,17 @@ unsigned long do_mremap(unsigned long addr,
if (vma->vm_flags & VM_MAYSHARE)
return (unsigned long) -EPERM;
- if (new_len > kobjsize((void *) addr))
+ if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
return (unsigned long) -ENOMEM;
/* all checks complete - do it */
vma->vm_end = vma->vm_start + new_len;
-
- askedalloc -= old_len;
- askedalloc += new_len;
-
return vma->vm_start;
}
-asmlinkage unsigned long sys_mremap(unsigned long addr,
- unsigned long old_len, unsigned long new_len,
- unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+ unsigned long, new_len, unsigned long, flags,
+ unsigned long, new_addr)
{
unsigned long ret;
@@ -1156,32 +1830,54 @@ asmlinkage unsigned long sys_mremap(unsigned long addr,
return ret;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags)
+struct page *follow_page_mask(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
+ *page_mask = 0;
return NULL;
}
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
- unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
- vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
-void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
+ unsigned long pfn = start >> PAGE_SHIFT;
+ unsigned long vm_len = vma->vm_end - vma->vm_start;
+
+ pfn += vma->vm_pgoff;
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
+EXPORT_SYMBOL(vm_iomap_memory);
-unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+/*
+ * Reposition @vma over the memory at @addr + (@pgoff << PAGE_SHIFT), keeping
+ * the VMA's existing length.
+ *
+ * Returns -EINVAL unless VM_USERMAP is set in @vma->vm_flags, otherwise 0.
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+ unsigned long pgoff)
{
- return -ENOMEM;
+ /* unsigned long, as in vm_iomap_memory(), so the length is not truncated */
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (!(vma->vm_flags & VM_USERMAP))
+ return -EINVAL;
+
+ vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
+ vma->vm_end = vma->vm_start + size;
+
+ return 0;
}
+EXPORT_SYMBOL(remap_vmalloc_range);
-void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
+unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
{
+ return -ENOMEM;
}
void unmap_mapping_range(struct address_space *mapping,
@@ -1207,9 +1903,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -1220,10 +1916,18 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
- free = global_page_state(NR_FILE_PAGES);
- free += nr_swap_pages;
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
/*
* Any slabs which are created with the
@@ -1234,34 +1938,18 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -1269,57 +1957,60 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
goto error;
}
- allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+ allowed = vm_commit_limit();
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
- allowed += total_swap_pages;
-
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
+ * Don't let a single process grow so big a user can't recover
*/
- if (atomic_read(&vm_committed_space) < (long)allowed)
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
+
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
return 0;
}
-struct page *filemap_nopage(struct vm_area_struct *area,
- unsigned long address, int *type)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
- return NULL;
+ return 0;
}
+EXPORT_SYMBOL(filemap_fault);
-/*
- * Access another process' address space.
- * - source/target buffer must be kernel space
- */
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct vm_area_struct *vma;
- struct mm_struct *mm;
+ BUG();
+}
+EXPORT_SYMBOL(filemap_map_pages);
- if (addr + len < addr)
- return 0;
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
+{
+ struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
@@ -1332,9 +2023,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
/* only read or write mappings where it is permitted */
if (write && vma->vm_flags & VM_MAYWRITE)
- len -= copy_to_user((void *) addr, buf, len);
+ copy_to_user_page(vma, NULL, addr,
+ (void *) addr, buf, len);
else if (!write && vma->vm_flags & VM_MAYREAD)
- len -= copy_from_user(buf, (void *) addr, len);
+ copy_from_user_page(vma, NULL, addr,
+ buf, (void *) addr, len);
else
len = 0;
} else {
@@ -1342,6 +2035,148 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
up_read(&mm->mmap_sem);
+
+ return len;
+}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ *
+ * Thin wrapper: forwards to __access_remote_vm() with a NULL task pointer and
+ * returns its result unchanged.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ * - returns 0 without transferring anything if @addr + @len wraps or if
+ *   get_task_mm() finds no mm for @tsk; otherwise returns the result of
+ *   __access_remote_vm()
+ * - the mm reference taken by get_task_mm() is dropped with mmput() before
+ *   returning
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ len = __access_remote_vm(tsk, mm, addr, buf, len, write);
+
mmput(mm);
return len;
}
+
+/**
+ * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
+ * @inode: The inode to check
+ * @size: The current filesize of the inode
+ * @newsize: The proposed filesize of the inode
+ *
+ * Check the shared mappings on an inode on behalf of a shrinking truncate to
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond the new size so that do_mmap_pgoff() doesn't
+ * automatically grant mappings that are too large.
+ *
+ * Returns -ETXTBSY if a VM_SHARED VMA is found in the page-offset range
+ * derived from @newsize and @size, otherwise 0.
+ */
+int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
+ size_t newsize)
+{
+ struct vm_area_struct *vma;
+ struct vm_region *region;
+ pgoff_t low, high;
+ size_t r_size, r_top;
+
+ low = newsize >> PAGE_SHIFT;
+ high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ down_write(&nommu_region_sem);
+ mutex_lock(&inode->i_mapping->i_mmap_mutex);
+
+ /* search for VMAs that fall within the dead zone */
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
+ /* found one - only interested if it's shared out of the page
+ * cache */
+ if (vma->vm_flags & VM_SHARED) {
+ mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ up_write(&nommu_region_sem);
+ return -ETXTBSY; /* not quite true, but near enough */
+ }
+ }
+
+ /* reduce any regions that overlap the dead zone - if in existence,
+ * these will be pointed to by VMAs that don't overlap the dead zone
+ *
+ * we don't check for any regions that start beyond the EOF as there
+ * shouldn't be any
+ */
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+ 0, ULONG_MAX) {
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ region = vma->vm_region;
+ r_size = region->vm_top - region->vm_start;
+ r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
+
+ if (r_top > newsize) {
+ region->vm_top -= r_top - newsize;
+ if (region->vm_end > region->vm_top)
+ region->vm_end = region->vm_top;
+ }
+ }
+
+ mutex_unlock(&inode->i_mapping->i_mmap_mutex);
+ up_write(&nommu_region_sem);
+ return 0;
+}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ *
+ * The arithmetic is done in kbytes: the free page count is shifted by
+ * (PAGE_SHIFT - 10), 1/32 of it (roughly 3%) is taken, and the result is
+ * capped at 1UL << 17 kbytes, i.e. 128MB. Always returns 0.
+ */
+static int __meminit init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ *
+ * The arithmetic is done in kbytes: 1/32 of the free memory is taken and
+ * capped at 1UL << 13 kbytes, i.e. 8MB. The 256MB threshold above is
+ * therefore measured against free memory (8MB * 32). Always returns 0.
+ */
+static int __meminit init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bada3d03119..3291e82d435 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
@@ -15,335 +17,527 @@
* kernel subsystems and hints as to where to find out what things do.
*/
+#include <linux/oom.h>
#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/notifier.h>
+#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
+#include <linux/security.h>
+#include <linux/ptrace.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
+#include <linux/ratelimit.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
int sysctl_panic_on_oom;
-/* #define DEBUG */
+int sysctl_oom_kill_allocating_task;
+int sysctl_oom_dump_tasks = 1;
+static DEFINE_SPINLOCK(zone_scan_lock);
+#ifdef CONFIG_NUMA
/**
- * badness - calculate a numeric value for how bad this task has been
- * @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
- *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
+ * has_intersects_mems_allowed() - check task eligibility for kill
+ * @start: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
*
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- * algorithm has been meticulously tuned to meet the principle
- * of least surprise ... (be careful when you change it)
+ * Task eligibility is determined by whether or not any thread of the candidate
+ * task, @start, shares the same mempolicy nodes as current if it is bound by
+ * such a policy and whether or not it has the same set of allowed cpuset nodes.
*/
+static bool has_intersects_mems_allowed(struct task_struct *start,
+ const nodemask_t *mask)
+{
+ struct task_struct *tsk;
+ bool ret = false;
+
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
+ if (mask) {
+ /*
+ * If this is a mempolicy constrained oom, tsk's
+ * cpuset is irrelevant. Only return true if its
+ * mempolicy intersects current, otherwise it may be
+ * needlessly killed.
+ */
+ ret = mempolicy_nodemask_intersects(tsk, mask);
+ } else {
+ /*
+ * This is not a mempolicy constrained oom, so only
+ * check the mems of tsk's cpuset.
+ */
+ ret = cpuset_mems_allowed_intersects(current, tsk);
+ }
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+ return ret;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+ const nodemask_t *mask)
{
- unsigned long points, cpu_time, run_time, s;
- struct mm_struct *mm;
- struct task_struct *child;
+ return true;
+}
+#endif /* CONFIG_NUMA */
- task_lock(p);
- mm = p->mm;
- if (!mm) {
- task_unlock(p);
- return 0;
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer. Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
+struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
+ task_lock(t);
+ if (likely(t->mm))
+ goto found;
+ task_unlock(t);
}
+ t = NULL;
+found:
+ rcu_read_unlock();
- /*
- * swapoff can easily use up all memory, so kill those first.
- */
- if (p->flags & PF_SWAPOFF)
- return ULONG_MAX;
+ return t;
+}
- /*
- * The memory size of the process is the basis for the badness.
- */
- points = mm->total_vm;
+/* return true if the task is not adequate as candidate victim task. */
+static bool oom_unkillable_task(struct task_struct *p,
+ const struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+ if (is_global_init(p))
+ return true;
+ if (p->flags & PF_KTHREAD)
+ return true;
- /*
- * After this unlock we can no longer dereference local variable `mm'
- */
- task_unlock(p);
+ /* When mem_cgroup_out_of_memory() and p is not member of the group */
+ if (memcg && !task_in_mem_cgroup(p, memcg))
+ return true;
- /*
- * Processes which fork a lot of child processes are likely
- * a good choice. We add half the vmsize of the children if they
- * have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children. In case a single
- * child is eating the vast majority of memory, adding only half
- * to the parents will make the child our kill candidate of choice.
- */
- list_for_each_entry(child, &p->children, sibling) {
- task_lock(child);
- if (child->mm != mm && child->mm)
- points += child->mm->total_vm/2 + 1;
- task_unlock(child);
- }
+ /* p may not have freeable memory in nodemask */
+ if (!has_intersects_mems_allowed(p, nodemask))
+ return true;
- /*
- * CPU time is in tens of seconds and run time is in thousands
- * of seconds. There is no particular reason for this other than
- * that it turned out to work very well in practice.
- */
- cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
- >> (SHIFT_HZ + 3);
+ return false;
+}
- if (uptime >= p->start_time.tv_sec)
- run_time = (uptime - p->start_time.tv_sec) >> 10;
- else
- run_time = 0;
+/**
+ * oom_badness - heuristic function to determine which candidate task to kill
+ * @p: task struct of which task we should calculate
+ * @totalpages: total present RAM allowed for page allocation
+ *
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
+ */
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+ const nodemask_t *nodemask, unsigned long totalpages)
+{
+ long points;
+ long adj;
+
+ if (oom_unkillable_task(p, memcg, nodemask))
+ return 0;
- s = int_sqrt(cpu_time);
- if (s)
- points /= s;
- s = int_sqrt(int_sqrt(run_time));
- if (s)
- points /= s;
+ p = find_lock_task_mm(p);
+ if (!p)
+ return 0;
+
+ adj = (long)p->signal->oom_score_adj;
+ if (adj == OOM_SCORE_ADJ_MIN) {
+ task_unlock(p);
+ return 0;
+ }
/*
- * Niced processes are most likely less important, so double
- * their badness points.
+ * The baseline for the badness score is the proportion of RAM that each
+ * task's rss, pagetable and swap space use.
*/
- if (task_nice(p) > 0)
- points *= 2;
+ points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
+ get_mm_counter(p->mm, MM_SWAPENTS);
+ task_unlock(p);
/*
- * Superuser processes are usually more important, so we make it
- * less likely that we kill those.
+ * Root processes get 3% bonus, just like the __vm_enough_memory()
+ * implementation used by LSMs.
*/
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
- p->uid == 0 || p->euid == 0)
- points /= 4;
+ if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+ points -= (points * 3) / 100;
+
+ /* Normalize to oom_score_adj units */
+ adj *= totalpages / 1000;
+ points += adj;
/*
- * We don't want to kill a process with direct hardware access.
- * Not only could that mess up the hardware, but usually users
- * tend to only have this flag set on applications they think
- * of as important.
+ * Never return 0 for an eligible task regardless of the root bonus and
+ * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
*/
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
- points /= 4;
+ return points > 0 ? points : 1;
+}
+/*
+ * Determine the type of allocation constraint.
+ */
+#ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
+{
+ struct zone *zone;
+ struct zoneref *z;
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ bool cpuset_limited = false;
+ int nid;
+
+ /* Default to all available memory */
+ *totalpages = totalram_pages + total_swap_pages;
+
+ if (!zonelist)
+ return CONSTRAINT_NONE;
/*
- * If p's nodes don't overlap ours, it may still help to kill p
- * because p may have allocated or otherwise mapped memory on
- * this node before. However it will be less likely.
+ * We reach here only when __GFP_NOFAIL is used, so we should avoid
+ * killing current; a random task has to be killed in this case.
+ * Ideally this would be CONSTRAINT_THISNODE, but that cannot be handled yet.
*/
- if (!cpuset_excl_nodes_overlap(p))
- points /= 8;
+ if (gfp_mask & __GFP_THISNODE)
+ return CONSTRAINT_NONE;
/*
- * Adjust the score by oomkilladj.
+ * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+ * the page allocator means a mempolicy is in effect. Cpuset policy
+ * is enforced in get_page_from_freelist().
*/
- if (p->oomkilladj) {
- if (p->oomkilladj > 0)
- points <<= p->oomkilladj;
- else
- points >>= -(p->oomkilladj);
+ if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, *nodemask)
+ *totalpages += node_spanned_pages(nid);
+ return CONSTRAINT_MEMORY_POLICY;
}
-#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
- p->pid, p->comm, points);
-#endif
- return points;
+ /* Check this allocation failure is caused by cpuset's wall function */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask)
+ if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+ cpuset_limited = true;
+
+ if (cpuset_limited) {
+ *totalpages = total_swap_pages;
+ for_each_node_mask(nid, cpuset_current_mems_allowed)
+ *totalpages += node_spanned_pages(nid);
+ return CONSTRAINT_CPUSET;
+ }
+ return CONSTRAINT_NONE;
}
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+ gfp_t gfp_mask, nodemask_t *nodemask,
+ unsigned long *totalpages)
+{
+ *totalpages = totalram_pages + total_swap_pages;
+ return CONSTRAINT_NONE;
+}
+#endif
-/*
- * Types of limitations to the nodes from which allocations may occur
- */
-#define CONSTRAINT_NONE 1
-#define CONSTRAINT_MEMORY_POLICY 2
-#define CONSTRAINT_CPUSET 3
-
-/*
- * Determine the type of allocation constraint.
- */
-static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
+ unsigned long totalpages, const nodemask_t *nodemask,
+ bool force_kill)
{
-#ifdef CONFIG_NUMA
- struct zone **z;
- nodemask_t nodes = node_online_map;
+ if (task->exit_state)
+ return OOM_SCAN_CONTINUE;
+ if (oom_unkillable_task(task, NULL, nodemask))
+ return OOM_SCAN_CONTINUE;
- for (z = zonelist->zones; *z; z++)
- if (cpuset_zone_allowed(*z, gfp_mask))
- node_clear(zone_to_nid(*z), nodes);
- else
- return CONSTRAINT_CPUSET;
+ /*
+ * This task already has access to memory reserves and is being killed.
+ * Don't allow any other task to have access to the reserves.
+ */
+ if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+ if (unlikely(frozen(task)))
+ __thaw_task(task);
+ if (!force_kill)
+ return OOM_SCAN_ABORT;
+ }
+ if (!task->mm)
+ return OOM_SCAN_CONTINUE;
- if (!nodes_empty(nodes))
- return CONSTRAINT_MEMORY_POLICY;
-#endif
+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task))
+ return OOM_SCAN_SELECT;
- return CONSTRAINT_NONE;
+ if (task->flags & PF_EXITING && !force_kill) {
+ /*
+ * If this task is not being ptraced on exit, then wait for it
+ * to finish before killing some other task unnecessarily.
+ */
+ if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
+ return OOM_SCAN_ABORT;
+ }
+ return OOM_SCAN_OK;
}
/*
* Simple selection loop. We chose the process with the highest
- * number of 'points'. We expect the caller will lock the tasklist.
+ * number of 'points'. Returns -1 on scan abort.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+ unsigned long totalpages, const nodemask_t *nodemask,
+ bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
- struct timespec uptime;
- *ppoints = 0;
+ unsigned long chosen_points = 0;
- do_posix_clock_monotonic_gettime(&uptime);
- do_each_thread(g, p) {
- unsigned long points;
- int releasing;
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ unsigned int points;
- /* skip kernel threads */
- if (!p->mm)
+ switch (oom_scan_process_thread(p, totalpages, nodemask,
+ force_kill)) {
+ case OOM_SCAN_SELECT:
+ chosen = p;
+ chosen_points = ULONG_MAX;
+ /* fall through */
+ case OOM_SCAN_CONTINUE:
continue;
- /* skip the init task with pid == 1 */
- if (p->pid == 1)
+ case OOM_SCAN_ABORT:
+ rcu_read_unlock();
+ return (struct task_struct *)(-1UL);
+ case OOM_SCAN_OK:
+ break;
+ };
+ points = oom_badness(p, NULL, nodemask, totalpages);
+ if (!points || points < chosen_points)
continue;
-
- /*
- * This is in the process of releasing memory so wait for it
- * to finish before killing some other task by mistake.
- *
- * However, if p is the current task, we allow the 'kill' to
- * go ahead if it is exiting: this will simply set TIF_MEMDIE,
- * which will allow it to gain access to memory reserves in
- * the process of exiting and releasing its resources.
- * Otherwise we could get an OOM deadlock.
- */
- releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
- p->flags & PF_EXITING;
- if (releasing) {
- /* PF_DEAD tasks have already released their mm */
- if (p->flags & PF_DEAD)
- continue;
- if (p->flags & PF_EXITING && p == current) {
- chosen = p;
- *ppoints = ULONG_MAX;
- break;
- }
- return ERR_PTR(-1UL);
- }
- if (p->oomkilladj == OOM_DISABLE)
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points && thread_group_leader(chosen))
continue;
- points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
- chosen = p;
- *ppoints = points;
- }
- } while_each_thread(g, p);
+ chosen = p;
+ chosen_points = points;
+ }
+ if (chosen)
+ get_task_struct(chosen);
+ rcu_read_unlock();
+
+ *ppoints = chosen_points * 1000 / totalpages;
return chosen;
}
/**
- * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
- * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
- * set.
+ * dump_tasks - dump current memory state of all system tasks
+ * @memcg: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
+ * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
+ * swapents, oom_score_adj value, and name.
*/
-static void __oom_kill_task(struct task_struct *p, const char *message)
+static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
- if (p->pid == 1) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill init!\n");
- return;
- }
+ struct task_struct *p;
+ struct task_struct *task;
- task_lock(p);
- if (!p->mm || p->mm == &init_mm) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task!\n");
- task_unlock(p);
- return;
- }
- task_unlock(p);
+ pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
+ rcu_read_lock();
+ for_each_process(p) {
+ if (oom_unkillable_task(p, memcg, nodemask))
+ continue;
- if (message) {
- printk(KERN_ERR "%s: Killed process %d (%s).\n",
- message, p->pid, p->comm);
- }
+ task = find_lock_task_mm(p);
+ if (!task) {
+ /*
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
+ */
+ continue;
+ }
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- p->time_slice = HZ;
- set_tsk_thread_flag(p, TIF_MEMDIE);
+ pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+ task->pid, from_kuid(&init_user_ns, task_uid(task)),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ atomic_long_read(&task->mm->nr_ptes),
+ get_mm_counter(task->mm, MM_SWAPENTS),
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+ }
+ rcu_read_unlock();
+}
- force_sig(SIGKILL, p);
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+ struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+ task_lock(current);
+ pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+ "oom_score_adj=%hd\n",
+ current->comm, gfp_mask, order,
+ current->signal->oom_score_adj);
+ cpuset_print_task_mems_allowed(current);
+ task_unlock(current);
+ dump_stack();
+ if (memcg)
+ mem_cgroup_print_oom_info(memcg, p);
+ else
+ show_mem(SHOW_MEM_FILTER_NODES);
+ if (sysctl_oom_dump_tasks)
+ dump_tasks(memcg, nodemask);
}
-static int oom_kill_task(struct task_struct *p, const char *message)
+#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * Must be called while holding a reference to p, which will be released upon
+ * returning.
+ */
+void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *memcg, nodemask_t *nodemask,
+ const char *message)
{
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
struct mm_struct *mm;
- struct task_struct *g, *q;
-
- mm = p->mm;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
- *
- * Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
- * However, this is of no concern to us.
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just set TIF_MEMDIE so it can die quickly
*/
+ if (p->flags & PF_EXITING) {
+ set_tsk_thread_flag(p, TIF_MEMDIE);
+ put_task_struct(p);
+ return;
+ }
- if (mm == NULL || mm == &init_mm)
- return 1;
+ if (__ratelimit(&oom_rs))
+ dump_header(p, gfp_mask, order, memcg, nodemask);
+
+ task_lock(p);
+ pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+ task_unlock(p);
- __oom_kill_task(p, message);
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
*/
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q, message);
- while_each_thread(g, q);
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
- return 0;
-}
+ if (child->mm == p->mm)
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child, memcg, nodemask,
+ totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
-static int oom_kill_process(struct task_struct *p, unsigned long points,
- const char *message)
-{
- struct task_struct *c;
- struct list_head *tsk;
+ p = find_lock_task_mm(victim);
+ if (!p) {
+ put_task_struct(victim);
+ return;
+ } else if (victim != p) {
+ get_task_struct(p);
+ put_task_struct(victim);
+ victim = p;
+ }
+
+ /* mm cannot safely be dereferenced after task_unlock(victim) */
+ mm = victim->mm;
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+ K(get_mm_counter(victim->mm, MM_ANONPAGES)),
+ K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+ task_unlock(victim);
/*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just set TIF_MEMDIE so it can die quickly
+ * Kill all user processes sharing victim->mm in other thread groups, if
+ * any. They don't get access to memory reserves, though, to avoid
+ * depletion of all memory. This prevents mm->mmap_sem livelock when an
+ * oom killed thread cannot exit because it requires the semaphore and
+ * it's contended by another thread trying to allocate memory itself.
+ * That thread will now get access to memory reserves since it has a
+ * pending fatal signal.
*/
- if (p->flags & PF_EXITING) {
- __oom_kill_task(p, NULL);
- return 0;
- }
+ rcu_read_lock();
+ for_each_process(p)
+ if (p->mm == mm && !same_thread_group(p, victim) &&
+ !(p->flags & PF_KTHREAD)) {
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
- printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li"
- " and children.\n", p->pid, p->comm, points);
- /* Try to kill a child first */
- list_for_each(tsk, &p->children) {
- c = list_entry(tsk, struct task_struct, sibling);
- if (c->mm == p->mm)
- continue;
- if (!oom_kill_task(c, message))
- return 0;
+ task_lock(p); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(p), p->comm);
+ task_unlock(p);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ }
+ rcu_read_unlock();
+
+ set_tsk_thread_flag(victim, TIF_MEMDIE);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ put_task_struct(victim);
+}
+#undef K
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ */
+void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+ int order, const nodemask_t *nodemask)
+{
+ if (likely(!sysctl_panic_on_oom))
+ return;
+ if (sysctl_panic_on_oom != 2) {
+ /*
+ * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+ * does not panic for cpuset, mempolicy, or memcg allocation
+ * failures.
+ */
+ if (constraint != CONSTRAINT_NONE)
+ return;
}
- return oom_kill_task(p, message);
+ dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ panic("Out of memory: %s panic_on_oom is enabled\n",
+ sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
@@ -360,85 +554,149 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
+ */
+int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ int ret = 1;
+
+ spin_lock(&zone_scan_lock);
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ if (zone_is_oom_locked(zone)) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ /*
+ * Lock each zone in the zonelist under zone_scan_lock so a
+ * parallel invocation of try_set_zonelist_oom() doesn't succeed
+ * when it shouldn't.
+ */
+ zone_set_flag(zone, ZONE_OOM_LOCKED);
+ }
+
+out:
+ spin_unlock(&zone_scan_lock);
+ return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ */
+void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ spin_lock(&zone_scan_lock);
+ for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
+ zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ }
+ spin_unlock(&zone_scan_lock);
+}
+
/**
* out_of_memory - kill the "best" process when we run out of memory
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
{
+ const nodemask_t *mpol_mask;
struct task_struct *p;
- unsigned long points = 0;
+ unsigned long totalpages;
unsigned long freed = 0;
+ unsigned int uninitialized_var(points);
+ enum oom_constraint constraint = CONSTRAINT_NONE;
+ int killed = 0;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return;
- if (printk_ratelimit()) {
- printk(KERN_WARNING "%s invoked oom-killer: "
- "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
- current->comm, gfp_mask, order, current->oomkilladj);
- dump_stack();
- show_mem();
+ /*
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
+ */
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
+ set_thread_flag(TIF_MEMDIE);
+ return;
}
- cpuset_lock();
- read_lock(&tasklist_lock);
-
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- switch (constrained_alloc(zonelist, gfp_mask)) {
- case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, points,
- "No available memory (MPOL_BIND)");
- break;
-
- case CONSTRAINT_CPUSET:
- oom_kill_process(current, points,
- "No available memory in cpuset");
- break;
-
- case CONSTRAINT_NONE:
- if (sysctl_panic_on_oom)
- panic("out of memory. panic_on_oom is selected\n");
-retry:
- /*
- * Rambo mode: Shoot down a process and hope it solves whatever
- * issues we may have.
- */
- p = select_bad_process(&points);
-
- if (PTR_ERR(p) == -1UL)
- goto out;
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- cpuset_unlock();
- panic("Out of memory and no killable processes...\n");
- }
-
- if (oom_kill_process(p, points, "Out of memory"))
- goto retry;
-
- break;
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+ &totalpages);
+ mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+
+ if (sysctl_oom_kill_allocating_task && current->mm &&
+ !oom_unkillable_task(current, NULL, nodemask) &&
+ current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(current);
+ oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
+ nodemask,
+ "Out of memory (oom_kill_allocating_task)");
+ goto out;
}
+ p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ panic("Out of memory and no killable processes...\n");
+ }
+ if (p != (void *)-1UL) {
+ oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+ nodemask, "Out of memory");
+ killed = 1;
+ }
out:
- read_unlock(&tasklist_lock);
- cpuset_unlock();
-
/*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
+ * Give the killed threads a good chance of exiting before trying to
+ * allocate memory again.
*/
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ if (killed)
+ schedule_timeout_killable(1);
+}
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
+ * parallel oom killing is already in progress so do nothing.
+ */
+void pagefault_out_of_memory(void)
+{
+ struct zonelist *zonelist;
+
+ if (mem_cgroup_oom_synchronize(true))
+ return;
+
+ zonelist = node_zonelist(first_online_node, GFP_KERNEL);
+ if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
+ out_of_memory(NULL, 0, 0, NULL, false);
+ clear_zonelist_oom(zonelist, GFP_KERNEL);
+ }
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 555752907dc..e0c943014eb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1,17 +1,18 @@
/*
- * mm/page-writeback.c.
+ * mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*
* Contains functions related to writing back dirty pages at the
* address_space level.
*
- * 10Apr2002 akpm@zip.com.au
+ * 10Apr2002 Andrew Morton
* Initial version
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -21,6 +22,7 @@
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
@@ -30,15 +32,32 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
+#include <linux/pagevec.h>
+#include <linux/timer.h>
+#include <linux/sched/rt.h>
+#include <linux/mm_inline.h>
+#include <trace/events/writeback.h>
+
+#include "internal.h"
+
+/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE max(HZ/5, 1)
/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_LOCK against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when they fall below it.
*/
-#define MAX_WRITEBACK_PAGES 1024
+#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+#define RATELIMIT_CALC_SHIFT 10
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -46,41 +65,47 @@
*/
static long ratelimit_pages = 32;
-static long total_pages; /* The total number of pages in the machine. */
-static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
+/* The following parameters are exported via /proc/sys/vm */
/*
- * When balance_dirty_pages decides that the caller needs to perform some
- * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
- * large amounts of I/O are submitted.
+ * Start background writeback (via writeback threads) at this percentage
*/
-static inline long sync_writeback_pages(void)
-{
- return ratelimit_pages + ratelimit_pages / 2;
-}
+int dirty_background_ratio = 10;
-/* The following parameters are exported via /proc/sys/vm */
+/*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
/*
- * Start background writeback (via pdflush) at this percentage
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
*/
-int dirty_background_ratio = 10;
+int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
-int vm_dirty_ratio = 40;
+int vm_dirty_ratio = 20;
/*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
*/
-int dirty_writeback_interval = 5 * HZ;
+unsigned long vm_dirty_bytes;
/*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The interval between `kupdate'-style writebacks
*/
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+
+EXPORT_SYMBOL_GPL(dirty_writeback_interval);
+
+/*
+ * The longest time for which data is allowed to remain dirty
+ */
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
/*
* Flag that makes the machine dump writes/reads and block dirtyings.
@@ -97,65 +122,165 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-
-static void background_writeout(unsigned long _min_pages);
+unsigned long global_dirty_limit;
/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
*
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around. To avoid stressing page reclaim with lots of unreclaimable
- * pages. It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+ TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
*
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
*
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
+ * Because the user is allowed to specify the dirty limit globally as
+ * an absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
*
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
*/
-static void
-get_dirty_limits(long *pbackground, long *pdirty,
- struct address_space *mapping)
+static unsigned long zone_dirtyable_memory(struct zone *zone)
{
- int background_ratio; /* Percentages */
- int dirty_ratio;
- int unmapped_ratio;
- long background;
- long dirty;
- unsigned long available_memory = total_pages;
- struct task_struct *tsk;
+ unsigned long nr_pages;
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_node_state(node, N_HIGH_MEMORY) {
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_dirtyable_memory(z);
+ }
+ /*
+ * Unreclaimable memory (kernel memory or anonymous memory
+ * without swap) can bring down the dirtyable pages below
+ * the zone's dirty balance reserve and the above calculation
+ * will underflow. However we still want to add in nodes
+ * which are below threshold (negative values) to get a more
+ * accurate calculation but make sure that the total never
+ * underflows.
+ */
+ if ((long)x < 0)
+ x = 0;
+
/*
- * If this mapping can only allocate from low memory,
- * we exclude high memory from our count.
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
*/
- if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
- available_memory -= totalhigh_pages;
+ return min(x, total);
+#else
+ return 0;
#endif
+}
+/**
+ * global_dirtyable_memory - number of globally dirtyable pages
+ *
+ * Returns the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
+ */
+static unsigned long global_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_page_state(NR_FREE_PAGES);
+ x -= min(x, dirty_balance_reserve);
- unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
- global_page_state(NR_ANON_PAGES)) * 100) /
- total_pages;
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
- dirty_ratio = vm_dirty_ratio;
- if (dirty_ratio > unmapped_ratio / 2)
- dirty_ratio = unmapped_ratio / 2;
+ if (!vm_highmem_is_dirtyable)
+ x -= highmem_dirtyable_memory(x);
- if (dirty_ratio < 5)
- dirty_ratio = 5;
+ return x + 1; /* Ensure that we never return 0 */
+}
+
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ unsigned long background;
+ unsigned long dirty;
+ unsigned long uninitialized_var(available_memory);
+ struct task_struct *tsk;
- background_ratio = dirty_background_ratio;
- if (background_ratio >= dirty_ratio)
- background_ratio = dirty_ratio / 2;
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = global_dirtyable_memory();
- background = (background_ratio * available_memory) / 100;
- dirty = (dirty_ratio * available_memory) / 100;
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
+
+ if (background >= dirty)
+ background = dirty / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
@@ -163,73 +288,1246 @@ get_dirty_limits(long *pbackground, long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+ unsigned long zone_memory = zone_dirtyable_memory(zone);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ zone_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * zone_memory / 100;
+
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long limit = zone_dirty_limit(zone);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
+
+int dirty_background_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_bytes = 0;
+ return ret;
+}
+
+int dirty_background_bytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ dirty_background_ratio = 0;
+ return ret;
+}
+
+int dirty_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_ratio = vm_dirty_ratio;
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ writeback_set_ratelimit();
+ vm_dirty_bytes = 0;
+ }
+ return ret;
+}
+
+int dirty_bytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned long old_bytes = vm_dirty_bytes;
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+ writeback_set_ratelimit();
+ vm_dirty_ratio = 0;
+ }
+ return ret;
+}
+
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+ cur_time += VM_COMPLETIONS_PERIOD_LEN;
+ /* 0 has a special meaning... */
+ if (!cur_time)
+ return 1;
+ return cur_time;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ __inc_bdi_stat(bdi, BDI_WRITTEN);
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+ bdi->max_prop_frac);
+ /* First event after period switching was turned off? */
+ if (!unlikely(writeout_period_time)) {
+ /*
+ * We can race with other __bdi_writeout_inc calls here but
+ * it does not cause any harm since the resulting time when
+ * timer will fire and what is in writeout_period_time will be
+ * roughly the same.
+ */
+ writeout_period_time = wp_next_time(jiffies);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ }
+}
+
+void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bdi_writeout_inc(bdi);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+ long *numerator, long *denominator)
+{
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
+ numerator, denominator);
+}
+
+/*
+ * On an idle system, we can be called long after we scheduled because we use
+ * deferred timers, so account for missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+ int miss_periods = (jiffies - writeout_period_time) /
+ VM_COMPLETIONS_PERIOD_LEN;
+
+ if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+ writeout_period_time = wp_next_time(writeout_period_time +
+ miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ writeout_period_time = 0;
+ }
+}
+
+/*
+ * bdi_min_ratio keeps the sum of the minimum dirty shares of all
+ * registered backing devices, which, for obvious reasons, can not
+ * exceed 100%.
+ */
+static unsigned int bdi_min_ratio;
+
+int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
+{
+ int ret = 0;
+
+ spin_lock_bh(&bdi_lock);
+ if (min_ratio > bdi->max_ratio) {
+ ret = -EINVAL;
+ } else {
+ min_ratio -= bdi->min_ratio;
+ if (bdi_min_ratio + min_ratio < 100) {
+ bdi_min_ratio += min_ratio;
+ bdi->min_ratio += min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+
+int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
+{
+ int ret = 0;
+
+ if (max_ratio > 100)
+ return -EINVAL;
+
+ spin_lock_bh(&bdi_lock);
+ if (bdi->min_ratio > max_ratio) {
+ ret = -EINVAL;
+ } else {
+ bdi->max_ratio = max_ratio;
+ bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_ratio);
+
+static unsigned long dirty_freerun_ceiling(unsigned long thresh,
+ unsigned long bg_thresh)
+{
+ return (thresh + bg_thresh) / 2;
+}
+
+static unsigned long hard_dirty_limit(unsigned long thresh)
+{
+ return max(thresh, global_dirty_limit);
+}
+
+/**
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
+ *
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ *
+ * Note that balance_dirty_pages() will only seriously take it as a hard limit
+ * when sleeping max_pause per page is not enough to keep the dirty pages under
+ * control. For example, when the device is completely stalled due to some error
+ * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * In the other normal situations, it acts more gently by throttling the tasks
+ * more (rather than completely block them) when the bdi dirty pages go high.
+ *
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
+
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
+}
+
+/*
+ * setpoint - dirty 3
+ * f(dirty) := 1.0 + (----------------)
+ * limit - setpoint
+ *
+ * it's a 3rd order polynomial that is subject to
+ *
+ * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
+ * (2) f(setpoint) = 1.0 => the balance point
+ * (3) f(limit) = 0 => the hard limit
+ * (4) df/dx <= 0 => negative feedback control
+ * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
+ * => fast response on large errors; small oscillation near setpoint
+ */
+static long long pos_ratio_polynom(unsigned long setpoint,
+ unsigned long dirty,
+ unsigned long limit)
+{
+ long long pos_ratio;
+ long x;
+
+ x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ limit - setpoint + 1);
+ pos_ratio = x;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
+ pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
+
+ return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
+}
+
+/*
+ * Dirty position control.
+ *
+ * (o) global/bdi setpoints
+ *
+ * We want the dirty pages to be balanced around the global/bdi setpoints.
+ * When the number of dirty pages is higher/lower than the setpoint, the
+ * dirty position control ratio (and hence task dirty ratelimit) will be
+ * decreased/increased to bring the dirty pages back to the setpoint.
+ *
+ * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
+ *
+ * if (dirty < setpoint) scale up pos_ratio
+ * if (dirty > setpoint) scale down pos_ratio
+ *
+ * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
+ * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ *
+ * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
+ *
+ * (o) global control line
+ *
+ * ^ pos_ratio
+ * |
+ * | |<===== global dirty control scope ======>|
+ * 2.0 .............*
+ * | .*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1.0 ................................*
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * | . . *
+ * 0 +------------.------------------.----------------------*------------->
+ * freerun^ setpoint^ limit^ dirty pages
+ *
+ * (o) bdi control line
+ *
+ * ^ pos_ratio
+ * |
+ * | *
+ * | *
+ * | *
+ * | *
+ * | * |<=========== span ============>|
+ * 1.0 .......................*
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * | . *
+ * 1/4 ...............................................* * * * * * * * * * * *
+ * | . .
+ * | . .
+ * | . .
+ * 0 +----------------------.-------------------------------.------------->
+ * bdi_setpoint^ x_intercept^
+ *
+ * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * be smoothly throttled down to normal if it starts high in situations like
+ * - start writing to a slow SD card and a fast disk at the same time. The SD
+ * card's bdi_dirty may rush to many times higher than bdi_setpoint.
+ * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ */
+static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty)
+{
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long x_intercept;
+ unsigned long setpoint; /* dirty pages' target balance point */
+ unsigned long bdi_setpoint;
+ unsigned long span;
+ long long pos_ratio; /* for scaling up/down the rate limit */
+ long x;
+
+ if (unlikely(dirty >= limit))
+ return 0; /* at or above the hard limit: ratio is 0 */
+
+ /*
+ * global setpoint: midway between the freerun ceiling and the hard
+ * limit.
+ *
+ * See comment for pos_ratio_polynom().
+ */
+ setpoint = (freerun + limit) / 2;
+ pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+
+ /*
+ * The strictlimit feature is a tool preventing mistrusted filesystems
+ * from growing a large number of dirty pages before throttling. For
+ * such filesystems balance_dirty_pages always checks bdi counters
+ * against bdi limits. Even if global "nr_dirty" is under "freerun".
+ * This is especially important for fuse which sets bdi->max_ratio to
+ * 1% by default. Without strictlimit feature, fuse writeback may
+ * consume arbitrary amount of RAM because it is accounted in
+ * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+ *
+ * Here, in bdi_position_ratio(), we calculate pos_ratio based on
+ * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+ * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
+ * limits are set by default to 10% and 20% (background and throttle).
+ * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+ * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
+ * about ~6K pages (as the average of background and throttle bdi
+ * limits). The 3rd order polynomial will provide positive feedback if
+ * bdi_dirty is under bdi_setpoint and vice versa.
+ *
+ * Note, that we cannot use global counters in these calculations
+ * because we want to throttle process writing to a strictlimit BDI
+ * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
+ * in the example above).
+ */
+ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ long long bdi_pos_ratio;
+ unsigned long bdi_bg_thresh;
+
+ if (bdi_dirty < 8)
+ return min_t(long long, pos_ratio * 2,
+ 2 << RATELIMIT_CALC_SHIFT); /* nearly empty bdi: double the global ratio, capped at 2.0 */
+
+ if (bdi_dirty >= bdi_thresh)
+ return 0;
+
+ bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+ bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
+ bdi_bg_thresh);
+
+ if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
+ return 0;
+
+ bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
+ bdi_thresh);
+
+ /*
+ * Typically, for strictlimit case, bdi_setpoint << setpoint
+ * and pos_ratio >> bdi_pos_ratio. In other words, global
+ * state ("dirty") is not the limiting factor and we have to
+ * make a decision based on bdi counters. But there is an
+ * important case when global pos_ratio should get precedence:
+ * global limits are exceeded (e.g. due to activities on other
+ * BDIs) while given strictlimit BDI is below limit.
+ *
+ * "pos_ratio * bdi_pos_ratio" would work for the case above,
+ * but it would look too non-natural for the case of all
+ * activity in the system coming from a single strictlimit BDI
+ * with bdi->max_ratio == 100%.
+ *
+ * Note that min() below somewhat changes the dynamics of the
+ * control system. Normally, pos_ratio value can be well over 3
+ * (when globally we are at freerun and bdi is well below bdi
+ * setpoint). Now the maximum pos_ratio in the same situation
+ * is 2. We might want to tweak this if we observe the control
+ * system is too slow to adapt.
+ */
+ return min(pos_ratio, bdi_pos_ratio);
+ }
+
+ /*
+ * We have computed basic pos_ratio above based on global situation. If
+ * the bdi is over/under its share of dirty pages, we want to scale
+ * pos_ratio further down/up. That is done by the following mechanism.
+ */
+
+ /*
+ * bdi setpoint
+ *
+ * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+ *
+ * := (x_intercept - bdi_dirty) / (x_intercept - bdi_setpoint)
+ *
+ * The main bdi control line is a linear function that is subject to
+ *
+ * (1) f(bdi_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single bdi case)
+ * or equally: x_intercept = bdi_setpoint + 8 * write_bw
+ *
+ * For single bdi case, the dirty pages are observed to fluctuate
+ * regularly within range
+ * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+ * for various filesystems, where (2) can yield a reasonable 12.5%
+ * fluctuation range for pos_ratio.
+ *
+ * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+ * own size, so move the slope over accordingly and choose a slope that
+ * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+ */
+ if (unlikely(bdi_thresh > thresh))
+ bdi_thresh = thresh;
+ /*
+ * It's very possible that bdi_thresh is close to 0 not because the
+ * device is slow, but that it has remained inactive for long time.
+ * Give such devices a reasonably good (hopefully IO efficient)
+ * threshold, so that the occasional writes won't be blocked and active
+ * writes can rampup the threshold quickly.
+ */
+ bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ /*
+ * scale global setpoint to bdi's:
+ * bdi_setpoint = setpoint * bdi_thresh / thresh
+ *
+ * x is the ratio bdi_thresh / thresh in 16-bit fixed point; the "+ 1"
+ * keeps the divisor non-zero.
+ */
+ x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+ bdi_setpoint = setpoint * (u64)x >> 16;
+ /*
+ * Use span=(8*write_bw) in single bdi case as indicated by
+ * (thresh - bdi_thresh ~= 0) and transition to bdi_thresh in JBOD case.
+ *
+ * span = (bdi_thresh / thresh) * (8 * write_bw)
+ * + ((thresh - bdi_thresh) / thresh) * bdi_thresh
+ */
+ span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = bdi_setpoint + span;
+
+ if (bdi_dirty < x_intercept - span / 4) {
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
+ x_intercept - bdi_setpoint + 1);
+ } else
+ pos_ratio /= 4; /* the bdi control line is floored at 1/4 */
+
+ /*
+ * bdi reserve area, safeguard against dirty pool underrun and disk idle
+ * It may push the desired control point of global dirty pages higher
+ * than setpoint.
+ */
+ x_intercept = bdi_thresh / 2;
+ if (bdi_dirty < x_intercept) {
+ if (bdi_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+ else
+ pos_ratio *= 8; /* boost is capped at 8x */
+ }
+
+ return pos_ratio; /* fixed point, scaled by 1 << RATELIMIT_CALC_SHIFT */
+}
+
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+ unsigned long elapsed,
+ unsigned long written)
+{
+ const unsigned long period = roundup_pow_of_two(3 * HZ);
+ unsigned long avg = bdi->avg_write_bandwidth;
+ unsigned long old = bdi->write_bandwidth;
+ u64 bw;
+
+ /*
+ * Update bdi->write_bandwidth and its smoothed companion
+ * bdi->avg_write_bandwidth from the pages written since
+ * bdi->written_stamp:
+ *
+ * bw = written * HZ / elapsed
+ *
+ * write_bandwidth = (bw * elapsed + write_bandwidth * (period - elapsed))
+ * / period
+ */
+ bw = written - bdi->written_stamp;
+ bw *= HZ;
+ if (unlikely(elapsed > period)) {
+ /* sample spans more than one period: use the raw rate for both values */
+ do_div(bw, elapsed);
+ avg = bw;
+ goto out;
+ }
+ bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw >>= ilog2(period); /* period is a power of two, so this divides by period */
+
+ /*
+ * one more level of smoothing, for filtering out sudden spikes:
+ * avg moves 1/8 of the way towards the old write_bandwidth, and only
+ * when old lies between avg and the new bw.
+ */
+ if (avg > old && old >= (unsigned long)bw)
+ avg -= (avg - old) >> 3;
+
+ if (avg < old && old <= (unsigned long)bw)
+ avg += (old - avg) >> 3;
+
+out:
+ bdi->write_bandwidth = bw;
+ bdi->avg_write_bandwidth = avg;
+}
+
+/*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ *
+ * global_dirty_limit is raised to thresh in a single step, but lowered by
+ * only 1/32 of its distance to max(thresh, dirty) per call.
+ */
+static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+{
+ unsigned long limit = global_dirty_limit;
+
+ /*
+ * Follow up in one step.
+ */
+ if (limit < thresh) {
+ limit = thresh;
+ goto update;
+ }
+
+ /*
+ * Follow down slowly. Use the higher one as the target, because thresh
+ * may drop below dirty. This is exactly the reason to introduce
+ * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ */
+ thresh = max(thresh, dirty);
+ if (limit > thresh) {
+ limit -= (limit - thresh) >> 5; /* close 1/32 of the gap */
+ goto update;
+ }
+ return; /* limit already equals the target: nothing to store */
+update:
+ global_dirty_limit = limit;
+}
+
+static void global_update_bandwidth(unsigned long thresh,
+ unsigned long dirty,
+ unsigned long now)
+{
+ static DEFINE_SPINLOCK(dirty_lock);
+ static unsigned long update_time;
+
+ /*
+ * Run update_dirty_limit() at most once per BANDWIDTH_INTERVAL.
+ *
+ * Check locklessly first to optimize away locking most of the time.
+ */
+ if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ return;
+
+ spin_lock(&dirty_lock);
+ if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { /* re-check now that dirty_lock is held */
+ update_dirty_limit(thresh, dirty);
+ update_time = now;
+ }
+ spin_unlock(&dirty_lock);
+}
+
+/*
+ * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ *
+ * Normal bdi tasks will be curbed at or below it in the long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ *
+ * The freshly computed estimate is also saved in
+ * bdi->balanced_dirty_ratelimit for use by the next update.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
+ unsigned long limit = hard_dirty_limit(thresh);
+ unsigned long setpoint = (freerun + limit) / 2;
+ unsigned long write_bw = bdi->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+ unsigned long dirty_rate;
+ unsigned long task_ratelimit;
+ unsigned long balanced_dirty_ratelimit;
+ unsigned long pos_ratio;
+ unsigned long step;
+ unsigned long x;
+
+ /*
+ * The dirty rate will match the writeout rate in long term, except
+ * when dirty pages are truncated by userspace or re-dirtied by FS.
+ */
+ dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+ pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty);
+ /*
+ * task_ratelimit reflects each dd's dirty rate for the past 200ms.
+ */
+ task_ratelimit = (u64)dirty_ratelimit *
+ pos_ratio >> RATELIMIT_CALC_SHIFT;
+ task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
+
+ /*
+ * A linear estimation of the "balanced" throttle rate. The theory is,
+ * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+ * dirty_rate will be measured to be (N * task_ratelimit). So the below
+ * formula will yield the balanced rate limit (write_bw / N).
+ *
+ * Note that the expanded form is not a pure rate feedback:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
+ * but also takes pos_ratio into account:
+ * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
+ *
+ * (1) is not realistic because pos_ratio also takes part in balancing
+ * the dirty rate. Consider the state
+ * pos_ratio = 0.5 (3)
+ * rate = 2 * (write_bw / N) (4)
+ * If (1) is used, it will get stuck in that state! Because each dd will
+ * be throttled at
+ * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
+ * yielding
+ * dirty_rate = N * task_ratelimit = write_bw (6)
+ * put (6) into (1) we get
+ * rate_(i+1) = rate_(i) (7)
+ *
+ * So we end up using (2) to always keep
+ * rate_(i+1) ~= (write_bw / N) (8)
+ * regardless of the value of pos_ratio. As long as (8) is satisfied,
+ * pos_ratio is able to drive itself to 1.0, which is not only where
+ * the dirty count meets the setpoint, but also where the slope of
+ * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
+ */
+ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
+ dirty_rate | 1); /* "| 1" keeps the divisor non-zero */
+ /*
+ * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
+ */
+ if (unlikely(balanced_dirty_ratelimit > write_bw))
+ balanced_dirty_ratelimit = write_bw;
+
+ /*
+ * We could safely do this and return immediately:
+ *
+ * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+ *
+ * However to get a more stable dirty_ratelimit, the below elaborated
+ * code makes use of task_ratelimit to filter out singular points and
+ * limit the step size.
+ *
+ * The below code essentially only uses the relative value of
+ *
+ * task_ratelimit - dirty_ratelimit
+ * = (pos_ratio - 1) * dirty_ratelimit
+ *
+ * which reflects the direction and size of dirty position error.
+ */
+
+ /*
+ * dirty_ratelimit will follow balanced_dirty_ratelimit iff
+ * task_ratelimit is on the same side of dirty_ratelimit, too.
+ * For example, when
+ * - dirty_ratelimit > balanced_dirty_ratelimit
+ * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
+ * lowering dirty_ratelimit will help meet both the position and rate
+ * control targets. Otherwise, don't update dirty_ratelimit if it will
+ * only help meet the rate target. After all, what the users ultimately
+ * feel and care about are stable dirty rate and small position error.
+ *
+ * |task_ratelimit - dirty_ratelimit| is used to limit the step size
+ * and filter out the singular points of balanced_dirty_ratelimit. Which
+ * keeps jumping around randomly and can even leap far away at times
+ * due to the small 200ms estimation period of dirty_rate (we want to
+ * keep that period small to reduce time lags).
+ */
+ step = 0;
+
+ /*
+ * For strictlimit case, calculations above were based on bdi counters
+ * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+ * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
+ * Hence, to calculate "step" properly, we have to use bdi_dirty as
+ * "dirty" and bdi_setpoint as "setpoint".
+ *
+ * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
+ * it's possible that bdi_thresh is close to zero due to inactivity
+ * of backing device (see the implementation of bdi_dirty_limit()).
+ */
+ if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ dirty = bdi_dirty;
+ if (bdi_dirty < 8)
+ setpoint = bdi_dirty + 1; /* forces the "dirty < setpoint" (rampup) branch below */
+ else
+ setpoint = (bdi_thresh +
+ bdi_dirty_limit(bdi, bg_thresh)) / 2;
+ }
+
+ if (dirty < setpoint) {
+ /* below setpoint: only ever step upwards, to the smallest of the three rates */
+ x = min(bdi->balanced_dirty_ratelimit,
+ min(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit < x)
+ step = x - dirty_ratelimit;
+ } else {
+ /* at/above setpoint: only ever step downwards, to the largest of the three rates */
+ x = max(bdi->balanced_dirty_ratelimit,
+ max(balanced_dirty_ratelimit, task_ratelimit));
+ if (dirty_ratelimit > x)
+ step = dirty_ratelimit - x;
+ }
+
+ /*
+ * Don't pursue 100% rate matching. It's impossible since the balanced
+ * rate itself is constantly fluctuating. So decrease the track speed
+ * when it gets close to the target. Helps eliminate pointless tremors.
+ */
+ step >>= dirty_ratelimit / (2 * step + 1);
+ /*
+ * Limit the tracking speed to avoid overshooting.
+ */
+ step = (step + 7) / 8; /* one eighth of the step, rounded up */
+
+ if (dirty_ratelimit < balanced_dirty_ratelimit)
+ dirty_ratelimit += step;
+ else
+ dirty_ratelimit -= step;
+
+ bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); /* the stored rate is never 0 */
+ bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+
+ trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ unsigned long now = jiffies;
+ unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long dirtied;
+ unsigned long written;
+
+ /*
+ * Refresh the bdi's write bandwidth estimate and, when thresh is
+ * non-zero, the global dirty limit and the bdi's dirty ratelimit.
+ *
+ * rate-limit, only update once every 200ms.
+ */
+ if (elapsed < BANDWIDTH_INTERVAL)
+ return;
+
+ dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
+ written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+ /*
+ * Skip quiet periods when disk bandwidth is under-utilized.
+ * (at least 1s idle time between two flusher runs)
+ *
+ * The estimates are left untouched; only the stamps are refreshed.
+ */
+ if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ goto snapshot;
+
+ if (thresh) {
+ global_update_bandwidth(thresh, dirty, now);
+ bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty,
+ dirtied, elapsed);
+ }
+ bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+ bdi->dirtied_stamp = dirtied; /* baseline counters and time for the next interval */
+ bdi->written_stamp = written;
+ bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+ unsigned long thresh,
+ unsigned long bg_thresh,
+ unsigned long dirty,
+ unsigned long bdi_thresh,
+ unsigned long bdi_dirty,
+ unsigned long start_time)
+{
+ if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+ return; /* lockless early exit; presumably "less than one interval since the last update" */
+ spin_lock(&bdi->wb.list_lock); /* __bdi_update_bandwidth() runs under wb.list_lock */
+ __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
+ bdi_thresh, bdi_dirty, start_time);
+ spin_unlock(&bdi->wb.list_lock);
+}
+
+/*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If dirty_poll_interval is too low, big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ *
+ * Returns 2^(ilog2(thresh - dirty) / 2) when dirty is below thresh, and 1
+ * otherwise.
+ */
+static unsigned long dirty_poll_interval(unsigned long dirty,
+ unsigned long thresh)
+{
+ if (thresh > dirty)
+ return 1UL << (ilog2(thresh - dirty) >> 1); /* halving the log2 approximates sqrt */
+
+ return 1;
+}
+
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
+{
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long t;
+
+ /*
+ * Limit pause time for small memory systems. If sleeping for too
+ * long, a small pool of dirty/writeback pages may go empty and disk go
+ * idle.
+ *
+ * 8 serves as the safety ratio.
+ *
+ * The result grows with bdi_dirty, shrinks with the bdi's average
+ * write bandwidth, is at least 1 and never exceeds MAX_PAUSE.
+ */
+ t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+ t++; /* never return a zero pause */
+
+ return min_t(unsigned long, t, MAX_PAUSE);
+}
+
+static long bdi_min_pause(struct backing_dev_info *bdi,
+ long max_pause,
+ unsigned long task_ratelimit,
+ unsigned long dirty_ratelimit,
+ int *nr_dirtied_pause)
+{
+ long hi = ilog2(bdi->avg_write_bandwidth);
+ long lo = ilog2(bdi->dirty_ratelimit);
+ long t; /* target pause */
+ long pause; /* estimated next pause */
+ int pages; /* target nr_dirtied_pause */
+
+ /* target for 10ms pause on 1-dd case */
+ t = max(1, HZ / 100);
+
+ /*
+ * Scale up pause time for concurrent dirtiers in order to reduce CPU
+ * overheads.
+ *
+ * (N * 10ms) on 2^N concurrent tasks.
+ *
+ * (hi - lo) is log2 of avg_write_bandwidth / dirty_ratelimit, which
+ * serves as the estimate of N.
+ */
+ if (hi > lo)
+ t += (hi - lo) * (10 * HZ) / 1024;
+
+ /*
+ * This is a bit convoluted. We try to base the next nr_dirtied_pause
+ * on the much more stable dirty_ratelimit. However the next pause time
+ * will be computed based on task_ratelimit and the two rate limits may
+ * depart considerably at some time. Especially if task_ratelimit goes
+ * below dirty_ratelimit/2 and the target pause is max_pause, the next
+ * pause time will be max_pause*2 _trimmed down_ to max_pause. As a
+ * result task_ratelimit won't be executed faithfully, which could
+ * eventually bring down dirty_ratelimit.
+ *
+ * We apply two rules to fix it up:
+ * 1) try to estimate the next pause time and if necessary, use a lower
+ * nr_dirtied_pause so as not to exceed max_pause. When this happens,
+ * nr_dirtied_pause will be "dancing" with task_ratelimit.
+ * 2) limit the target pause time to max_pause/2, so that the normal
+ * small fluctuations of task_ratelimit won't trigger rule (1) and
+ * nr_dirtied_pause will remain as stable as dirty_ratelimit.
+ */
+ t = min(t, 1 + max_pause / 2);
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+
+ /*
+ * Tiny nr_dirtied_pause is found to hurt I/O performance in the test
+ * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
+ * When the 16 consecutive reads are often interrupted by some dirty
+ * throttling pause during the async writes, cfq will go into idles
+ * (deadline is fine). So push nr_dirtied_pause as high as possible
+ * until it reaches DIRTY_POLL_THRESH=32 pages.
+ */
+ if (pages < DIRTY_POLL_THRESH) {
+ t = max_pause;
+ pages = dirty_ratelimit * t / roundup_pow_of_two(HZ);
+ if (pages > DIRTY_POLL_THRESH) {
+ pages = DIRTY_POLL_THRESH;
+ t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
+ }
+ }
+
+ pause = HZ * pages / (task_ratelimit + 1); /* +1 keeps the divisor non-zero */
+ if (pause > max_pause) {
+ /* rule (1) above: shrink the page target so the pause fits max_pause */
+ t = max_pause;
+ pages = task_ratelimit * t / roundup_pow_of_two(HZ);
+ }
+
+ *nr_dirtied_pause = pages; /* out parameter: the target page count */
+ /*
+ * The minimal pause time will normally be half the target pause time.
+ * When the page target stayed below DIRTY_POLL_THRESH, the full target
+ * pause is returned instead.
+ */
+ return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
+}
+
+static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
+ unsigned long dirty_thresh,
+ unsigned long background_thresh,
+ unsigned long *bdi_dirty,
+ unsigned long *bdi_thresh,
+ unsigned long *bdi_bg_thresh)
+{
+ unsigned long bdi_reclaimable;
+
+ /*
+ * Fill in the per-bdi values: *bdi_thresh, *bdi_dirty and, when the
+ * pointer is non-NULL, *bdi_bg_thresh.
+ *
+ * bdi_thresh is not treated as some limiting factor as
+ * dirty_thresh, due to reasons
+ * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in a system with HDD and USB key, the USB key may somehow
+ * go into state (bdi_dirty >> bdi_thresh) either because
+ * bdi_dirty starts high, or because bdi_thresh drops low.
+ * In this case we don't want to hard throttle the USB key
+ * dirtiers for 100 seconds until bdi_dirty drops under
+ * bdi_thresh. Instead the auxiliary bdi control line in
+ * bdi_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ */
+ *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+ if (bdi_bg_thresh)
+ *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
+ background_thresh,
+ dirty_thresh) : 0; /* bdi_thresh * background/dirty ratio; 0 when dirty_thresh is 0 (no division by zero) */
+
+ /*
+ * In order to avoid the stacked BDI deadlock we need
+ * to ensure we accurately count the 'dirty' pages when
+ * the threshold is low.
+ *
+ * Otherwise it would be possible to get thresh+n pages
+ * reported dirty, even though there are thresh-m pages
+ * actually dirty; with m+n sitting in the percpu
+ * deltas.
+ */
+ if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
+ /* low threshold: bdi_stat_sum() is presumably the exact sum including percpu deltas - TODO confirm */
+ bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+ *bdi_dirty = bdi_reclaimable +
+ bdi_stat_sum(bdi, BDI_WRITEBACK);
+ } else {
+ bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+ *bdi_dirty = bdi_reclaimable +
+ bdi_stat(bdi, BDI_WRITEBACK);
+ }
}
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
*/
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+ unsigned long pages_dirtied)
{
- long nr_reclaimable;
- long background_thresh;
- long dirty_thresh;
- unsigned long pages_written = 0;
- unsigned long write_chunk = sync_writeback_pages();
-
+ unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ long period;
+ long pause;
+ long max_pause;
+ long min_pause;
+ int nr_dirtied_pause;
+ bool dirty_exceeded = false;
+ unsigned long task_ratelimit;
+ unsigned long dirty_ratelimit;
+ unsigned long pos_ratio;
struct backing_dev_info *bdi = mapping->backing_dev_info;
+ bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
+ unsigned long start_time = jiffies;
for (;;) {
- struct writeback_control wbc = {
- .bdi = bdi,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = write_chunk,
- .range_cyclic = 1,
- };
-
- get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
- dirty_thresh)
- break;
-
- if (!dirty_exceeded)
- dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ unsigned long now = jiffies;
+ unsigned long uninitialized_var(bdi_thresh);
+ unsigned long thresh;
+ unsigned long uninitialized_var(bdi_dirty);
+ unsigned long dirty;
+ unsigned long bg_thresh;
+
+ /*
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- if (nr_reclaimable) {
- writeback_inodes(&wbc);
- get_dirty_limits(&background_thresh,
- &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- if (nr_reclaimable +
- global_page_state(NR_WRITEBACK)
- <= dirty_thresh)
- break;
- pages_written += write_chunk - wbc.nr_to_write;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+
+ if (unlikely(strictlimit)) {
+ bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+ &bdi_dirty, &bdi_thresh, &bg_thresh);
+
+ dirty = bdi_dirty;
+ thresh = bdi_thresh;
+ } else {
+ dirty = nr_dirty;
+ thresh = dirty_thresh;
+ bg_thresh = background_thresh;
}
- blk_congestion_wait(WRITE, HZ/10);
+
+ /*
+ * Throttle it only when the background writeback cannot
+ * catch-up. This avoids (excessively) small writeouts
+ * when the bdi limits are ramping up in case of !strictlimit.
+ *
+ * In strictlimit case make decision based on the bdi counters
+ * and limits. Small writeouts when the bdi limits are ramping
+ * up are the price we consciously pay for strictlimit-ing.
+ */
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ current->nr_dirtied_pause =
+ dirty_poll_interval(dirty, thresh);
+ break;
+ }
+
+ if (unlikely(!writeback_in_progress(bdi)))
+ bdi_start_background_writeback(bdi);
+
+ if (!strictlimit)
+ bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
+ &bdi_dirty, &bdi_thresh, NULL);
+
+ dirty_exceeded = (bdi_dirty > bdi_thresh) &&
+ ((nr_dirty > dirty_thresh) || strictlimit);
+ if (dirty_exceeded && !bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
+ nr_dirty, bdi_thresh, bdi_dirty,
+ start_time);
+
+ dirty_ratelimit = bdi->dirty_ratelimit;
+ pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+ background_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty);
+ task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+ RATELIMIT_CALC_SHIFT;
+ max_pause = bdi_max_pause(bdi, bdi_dirty);
+ min_pause = bdi_min_pause(bdi, max_pause,
+ task_ratelimit, dirty_ratelimit,
+ &nr_dirtied_pause);
+
+ if (unlikely(task_ratelimit == 0)) {
+ period = max_pause;
+ pause = max_pause;
+ goto pause;
+ }
+ period = HZ * pages_dirtied / task_ratelimit;
+ pause = period;
+ if (current->dirty_paused_when)
+ pause -= now - current->dirty_paused_when;
+ /*
+ * For less than 1s think time (ext3/4 may block the dirtier
+ * for up to 800ms from time to time on 1-HDD; so does xfs,
+ * however at much less frequency), try to compensate it in
+ * future periods by updating the virtual time; otherwise just
+ * do a reset, as it may be a light dirtier.
+ */
+ if (pause < min_pause) {
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ min(pause, 0L),
+ start_time);
+ if (pause < -HZ) {
+ current->dirty_paused_when = now;
+ current->nr_dirtied = 0;
+ } else if (period) {
+ current->dirty_paused_when += period;
+ current->nr_dirtied = 0;
+ } else if (current->nr_dirtied_pause <= pages_dirtied)
+ current->nr_dirtied_pause += pages_dirtied;
+ break;
+ }
+ if (unlikely(pause > max_pause)) {
+ /* for occasional dropped task_ratelimit */
+ now += min(pause - max_pause, max_pause);
+ pause = max_pause;
+ }
+
+pause:
+ trace_balance_dirty_pages(bdi,
+ dirty_thresh,
+ background_thresh,
+ nr_dirty,
+ bdi_thresh,
+ bdi_dirty,
+ dirty_ratelimit,
+ task_ratelimit,
+ pages_dirtied,
+ period,
+ pause,
+ start_time);
+ __set_current_state(TASK_KILLABLE);
+ io_schedule_timeout(pause);
+
+ current->dirty_paused_when = now + pause;
+ current->nr_dirtied = 0;
+ current->nr_dirtied_pause = nr_dirtied_pause;
+
+ /*
+ * This is typically equal to (nr_dirty < dirty_thresh) and can
+ * also keep "1000+ dd on a slow USB stick" under control.
+ */
+ if (task_ratelimit)
+ break;
+
+ /*
+ * In the case of an unresponsive NFS server whose NFS dirty
+ * pages exceed dirty_thresh, give the other good bdi's a pipe
+ * to go through, so that tasks on them still remain responsive.
+ *
+ * In theory 1 page is enough to keep the consumer-producer
+ * pipe going: the flusher cleans 1 page => the task dirties 1
+ * more page. However bdi_dirty has accounting errors. So use
+ * the larger and more IO friendly bdi_stat_error.
+ */
+ if (bdi_dirty <= bdi_stat_error(bdi))
+ break;
+
+ if (fatal_signal_pending(current))
+ break;
}
- if (nr_reclaimable + global_page_state(NR_WRITEBACK)
- <= dirty_thresh && dirty_exceeded)
- dirty_exceeded = 0;
+ if (!dirty_exceeded && bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
- return; /* pdflush is already working this queue */
+ return;
/*
* In laptop mode, we wait until hitting the higher threshold before
@@ -239,9 +1537,11 @@ static void balance_dirty_pages(struct address_space *mapping)
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
- if ((laptop_mode && pages_written) ||
- (!laptop_mode && (nr_reclaimable > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ if (laptop_mode)
+ return;
+
+ if (nr_reclaimable > background_thresh)
+ bdi_start_background_writeback(bdi);
}
void set_page_dirty_balance(struct page *page)
@@ -254,10 +1554,27 @@ void set_page_dirty_balance(struct page *page)
}
}
+static DEFINE_PER_CPU(int, bdp_ratelimits);
+
+/*
+ * Normal tasks are throttled by
+ * loop {
+ * dirty tsk->nr_dirtied_pause pages;
+ * take a snap in balance_dirty_pages();
+ * }
+ * However there is a worst case. If every task exits immediately after dirtying
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
/**
- * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
- * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -268,41 +1585,60 @@ void set_page_dirty_balance(struct page *page)
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
- unsigned long nr_pages_dirtied)
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
- unsigned long ratelimit;
- unsigned long *p;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ int ratelimit;
+ int *p;
+
+ if (!bdi_cap_account_dirty(bdi))
+ return;
- ratelimit = ratelimit_pages;
- if (dirty_exceeded)
- ratelimit = 8;
+ ratelimit = current->nr_dirtied_pause;
+ if (bdi->dirty_exceeded)
+ ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+ preempt_disable();
/*
- * Check the rate limiting. Also, we do not want to throttle real-time
- * tasks in balance_dirty_pages(). Period.
+ * This prevents one CPU from accumulating too many dirtied pages without
+ * calling into balance_dirty_pages(), which can happen when there are
+ * 1000+ tasks that all start dirtying pages at exactly the same time and
+ * hence all honour a too-large initial task->nr_dirtied_pause.
*/
- preempt_disable();
- p = &__get_cpu_var(ratelimits);
- *p += nr_pages_dirtied;
- if (unlikely(*p >= ratelimit)) {
+ p = this_cpu_ptr(&bdp_ratelimits);
+ if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
- preempt_enable();
- balance_dirty_pages(mapping);
- return;
+ else if (unlikely(*p >= ratelimit_pages)) {
+ *p = 0;
+ ratelimit = 0;
+ }
+ /*
+ * Pick up the pages dirtied by exited tasks. This prevents lots of
+ * short-lived tasks (e.g. gcc invocations in a kernel build) from escaping
+ * the dirty throttling and livelocking other long-running dirtiers.
+ */
+ p = this_cpu_ptr(&dirty_throttle_leaks);
+ if (*p > 0 && current->nr_dirtied < ratelimit) {
+ unsigned long nr_pages_dirtied;
+ nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+ *p -= nr_pages_dirtied;
+ current->nr_dirtied += nr_pages_dirtied;
}
preempt_enable();
+
+ if (unlikely(current->nr_dirtied >= ratelimit))
+ balance_dirty_pages(mapping, current->nr_dirtied);
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
-void throttle_vm_writeout(void)
+void throttle_vm_writeout(gfp_t gfp_mask)
{
- long background_thresh;
- long dirty_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ dirty_thresh = hard_dirty_limit(dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -313,156 +1649,42 @@ void throttle_vm_writeout(void)
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
- blk_congestion_wait(WRITE, HZ/10);
- }
-}
-
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
- long min_pages = _min_pages;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
- .range_cyclic = 1,
- };
-
- for ( ; ; ) {
- long background_thresh;
- long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
+ /*
+ * The caller might hold locks which can prevent IO completion
+ * or progress in the filesystem. So we cannot just sit here
+ * waiting for IO to complete.
+ */
+ if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
break;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- blk_congestion_wait(WRITE, HZ/10);
- if (!wbc.encountered_congestion)
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
- if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
-/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - dirty_expire_interval;
- start_jif = jiffies;
- next_jif = start_jif + dirty_writeback_interval;
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion)
- blk_congestion_wait(WRITE, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
+ }
}
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-int dirty_writeback_centisecs_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval) {
- mod_timer(&wb_timer,
- jiffies + dirty_writeback_interval);
- } else {
- del_timer(&wb_timer);
- }
+ proc_dointvec(table, write, buffer, length, ppos);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
-{
- sys_sync();
-}
+ struct request_queue *q = (struct request_queue *)data;
+ int nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
-static void laptop_timer_fn(unsigned long unused)
-{
- pdflush_operation(laptop_flush, 0);
+ /*
+ * We want to write everything out, not just down to the dirty
+ * threshold
+ */
+ if (bdi_has_dirty_io(&q->backing_dev_info))
+ bdi_start_writeback(&q->backing_dev_info, nr_pages,
+ WB_REASON_LAPTOP_TIMER);
}
/*
@@ -470,9 +1692,9 @@ static void laptop_timer_fn(unsigned long unused)
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+ mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
@@ -482,8 +1704,16 @@ void laptop_io_completion(void)
*/
void laptop_sync_completion(void)
{
- del_timer(&laptop_mode_wb_timer);
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ del_timer(&bdi->laptop_mode_wb_timer);
+
+ rcu_read_unlock();
}
+#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
@@ -493,65 +1723,318 @@ void laptop_sync_completion(void)
*
* Here we set ratelimit_pages to a level which ensures that when all CPUs are
* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
+ * thresholds.
*/
-static void set_ratelimit(void)
+void writeback_set_ratelimit(void)
{
- ratelimit_pages = total_pages / (num_online_cpus() * 32);
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ global_dirty_limit = dirty_thresh;
+ ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
- if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
- ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
}
-static int __cpuinit
-ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
- set_ratelimit();
- return 0;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ writeback_set_ratelimit();
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
}
-static struct notifier_block __cpuinitdata ratelimit_nb = {
+static struct notifier_block ratelimit_nb = {
.notifier_call = ratelimit_handler,
.next = NULL,
};
/*
- * If the machine has a large highmem:lowmem ratio then scale back the default
- * dirty memory thresholds: allowing too much dirty highmem pins an excessive
- * number of buffer_heads.
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages).
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
*/
void __init page_writeback_init(void)
{
- long buffer_pages = nr_free_buffer_pages();
- long correction;
+ writeback_set_ratelimit();
+ register_cpu_notifier(&ratelimit_nb);
+
+ fprop_global_init(&writeout_completions);
+}
+
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback. This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+ unsigned long tagged;
+
+ do {
+ spin_lock_irq(&mapping->tree_lock);
+ tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+ &start, end, WRITEBACK_TAG_BATCH,
+ PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+ cond_resched();
+ /* We check 'start' to handle wrapping when end == ~0UL */
+ } while (tagged >= WRITEBACK_TAG_BATCH && start);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them. If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
+ */
+int write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, writepage_t writepage,
+ void *data)
+{
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ int tag;
+
+ pagevec_init(&pvec, 0);
+ if (wbc->range_cyclic) {
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ /*
+ * can't be range_cyclic (1st pass) because
+ * end == -1 in that case.
+ */
+ done = 1;
+ break;
+ }
+
+ done_index = page->index;
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data integrity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
- total_pages = nr_free_pagecache_pages();
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
- correction = (100 * 4 * buffer_pages) / total_pages;
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
- if (correction < 100) {
- dirty_background_ratio *= correction;
- dirty_background_ratio /= 100;
- vm_dirty_ratio *= correction;
- vm_dirty_ratio /= 100;
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ done_index = page->index + 1;
+ done = 1;
+ break;
+ }
+ }
- if (dirty_background_ratio <= 0)
- dirty_background_ratio = 1;
- if (vm_dirty_ratio <= 0)
- vm_dirty_ratio = 1;
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
}
- mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
- set_ratelimit();
- register_cpu_notifier(&ratelimit_nb);
+ if (!cycled && !done) {
+ /*
+ * range_cyclic:
+ * We hit the last page and there is more work to be done: wrap
+ * back to the start of the file
+ */
+ cycled = 1;
+ index = 0;
+ end = writeback_index - 1;
+ goto retry;
+ }
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = done_index;
+
+ return ret;
}
+EXPORT_SYMBOL(write_cache_pages);
+
+/*
+ * Function used by generic_writepages to call the real writepage
+ * function and set the mapping flags on error
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = mapping->a_ops->writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ */
+int generic_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct blk_plug plug;
+ int ret;
+
+ /* deal with chardevs and other special file */
+ if (!mapping->a_ops->writepage)
+ return 0;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
@@ -559,18 +2042,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0)
return 0;
- wbc->for_writepages = 1;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
else
ret = generic_writepages(mapping, wbc);
- wbc->for_writepages = 0;
return ret;
}
/**
* write_one_page - write out a single page and optionally wait on I/O
- *
* @page: the page to write
* @wait: if true, wait on writeout
*
@@ -609,6 +2089,53 @@ int write_one_page(struct page *page, int wait)
EXPORT_SYMBOL(write_one_page);
/*
+ * For address_spaces which do not use buffers nor write back.
+ */
+int __set_page_dirty_no_writeback(struct page *page)
+{
+ if (!PageDirty(page))
+ return !TestSetPageDirty(page);
+ return 0;
+}
+
+/*
+ * Helper function for set_page_dirty family.
+ * NOTE: This relies on being atomic wrt interrupts.
+ */
+void account_page_dirtied(struct page *page, struct address_space *mapping)
+{
+ trace_writeback_dirty_page(page, mapping);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_DIRTIED);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ current->nr_dirtied++;
+ this_cpu_inc(bdp_ratelimits);
+ }
+}
+EXPORT_SYMBOL(account_page_dirtied);
+
+/*
+ * Helper function for set_page_writeback family.
+ *
+ * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
+ * while calling this function.
+ * See test_set_page_writeback for example.
+ *
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic
+ * wrt interrupts.
+ */
+void account_page_writeback(struct page *page)
+{
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITEBACK);
+}
+EXPORT_SYMBOL(account_page_writeback);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -621,31 +2148,31 @@ EXPORT_SYMBOL(write_one_page);
* mapping is pinned by the vma's ->vm_file reference.
*
* We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() insode tree_lock.
+ * mapping by re-checking page_mapping() inside tree_lock.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
+ unsigned long flags;
- if (mapping) {
- write_lock_irq(&mapping->tree_lock);
- mapping2 = page_mapping(page);
- if (mapping2) { /* Race with truncate? */
- BUG_ON(mapping2 != mapping);
- if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page,
- NR_FILE_DIRTY);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- write_unlock_irq(&mapping->tree_lock);
- if (mapping->host) {
- /* !PageAnon && !swapper_space */
- __mark_inode_dirty(mapping->host,
- I_DIRTY_PAGES);
- }
+ if (!mapping)
+ return 1;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ mapping2 = page_mapping(page);
+ if (mapping2) { /* Race with truncate? */
+ BUG_ON(mapping2 != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
@@ -654,6 +2181,24 @@ int __set_page_dirty_nobuffers(struct page *page)
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in the long term. Mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ current->nr_dirtied--;
+ dec_zone_page_state(page, NR_DIRTIED);
+ dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ }
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -661,23 +2206,44 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
wbc->pages_skipped++;
+ account_page_redirty(page);
return __set_page_dirty_nobuffers(page);
}
EXPORT_SYMBOL(redirty_page_for_writepage);
/*
+ * Dirty a page.
+ *
+ * For pages with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory error handling, which prefers a
+ * consistent dirty state. This rule can be broken in some special cases,
+ * but it is better not to.
+ *
* If the mapping doesn't provide a set_page_dirty a_op, then
* just fall through and assume that it wants buffer_heads.
*/
-int fastcall set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
{
struct address_space *mapping = page_mapping(page);
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
- if (spd)
- return (*spd)(page);
- return __set_page_dirty_buffers(page);
+ /*
+ * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim
+ * set due to a race with end_page_writeback.
+ * For readahead: if the page is written, the flag is reset, so there
+ * is no problem.
+ * For lru_deactivate_page: if the page is redirtied, the flag is reset,
+ * so there is no problem. But if the page is then used by readahead,
+ * it will confuse readahead and make it restart the size ramp-up
+ * process. That is only a trivial problem.
+ */
+ ClearPageReclaim(page);
+#ifdef CONFIG_BLOCK
+ if (!spd)
+ spd = __set_page_dirty_buffers;
+#endif
+ return (*spd)(page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
@@ -701,7 +2267,7 @@ int set_page_dirty_lock(struct page *page)
{
int ret;
- lock_page_nosync(page);
+ lock_page(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
@@ -709,39 +2275,6 @@ int set_page_dirty_lock(struct page *page)
EXPORT_SYMBOL(set_page_dirty_lock);
/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- */
-int test_clear_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
- unsigned long flags;
-
- if (mapping) {
- write_lock_irqsave(&mapping->tree_lock, flags);
- if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- /*
- * We can continue to use `mapping' here because the
- * page is locked, which pins the address_space
- */
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
- return 1;
- }
- write_unlock_irqrestore(&mapping->tree_lock, flags);
- return 0;
- }
- return TestClearPageDirty(page);
-}
-EXPORT_SYMBOL(test_clear_page_dirty);
-
-/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
@@ -759,12 +2292,50 @@ int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ BUG_ON(!PageLocked(page));
+
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ /*
+ * Yes, Virginia, this is indeed insane.
+ *
+ * We use this sequence to make sure that
+ * (a) we account for dirty stats properly
+ * (b) we tell the low-level filesystem to
+ * mark the whole page dirty if it was
+ * dirty in a pagetable. Only to then
+ * (c) clean the page again and return 1 to
+ * cause the writeback.
+ *
+ * This way we avoid all nasty races with the
+ * dirty bit in multiple places and clearing
+ * them concurrently from different threads.
+ *
+ * Note! Normally the "set_page_dirty(page)"
+ * has no effect on the actual dirty bit - since
+ * that will already usually be set. But we
+ * need the side effects, and it can help us
+ * avoid races.
+ *
+ * We basically use the page "master dirty bit"
+ * as a serialization point for all the different
+ * threads doing their things.
+ */
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ /*
+ * We carefully synchronise fault handlers against
+ * installing a dirty pte and marking the page dirty
+ * at this point. We do this by having them hold the
+ * page lock at some point after installing their
+ * pte, but before marking the page dirty.
+ * Pages are always locked coming in here, so we get
+ * the desired exclusion. See mm/memory.c:do_wp_page()
+ * for more comments.
+ */
if (TestClearPageDirty(page)) {
- if (mapping_cap_account_dirty(mapping)) {
- page_mkclean(page);
- dec_zone_page_state(page, NR_FILE_DIRTY);
- }
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
return 1;
}
return 0;
@@ -777,71 +2348,105 @@ int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
int ret;
+ bool locked;
+ unsigned long memcg_flags;
+ mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
- write_lock_irqsave(&mapping->tree_lock, flags);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
- if (ret)
+ if (ret) {
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
+ if (bdi_cap_account_writeback(bdi)) {
+ __dec_bdi_stat(bdi, BDI_WRITEBACK);
+ __bdi_writeout_inc(bdi);
+ }
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
}
+ if (ret) {
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ dec_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+ }
+ mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
-int test_set_page_writeback(struct page *page)
+int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret;
+ bool locked;
+ unsigned long memcg_flags;
+ mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
if (mapping) {
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
- write_lock_irqsave(&mapping->tree_lock, flags);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
- if (!ret)
+ if (!ret) {
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_account_writeback(bdi))
+ __inc_bdi_stat(bdi, BDI_WRITEBACK);
+ }
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- write_unlock_irqrestore(&mapping->tree_lock, flags);
+ if (!keep_write)
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
}
+ if (!ret)
+ account_page_writeback(page);
+ mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
}
-EXPORT_SYMBOL(test_set_page_writeback);
+EXPORT_SYMBOL(__test_set_page_writeback);
/*
- * Wakes up tasks that are being throttled due to writeback congestion
+ * Return true if any of the pages in the mapping are marked with the
+ * passed tag.
*/
-void writeback_congestion_end(void)
+int mapping_tagged(struct address_space *mapping, int tag)
{
- blk_congestion_end(WRITE);
+ return radix_tree_tagged(&mapping->page_tree, tag);
}
-EXPORT_SYMBOL(writeback_congestion_end);
+EXPORT_SYMBOL(mapping_tagged);
-/*
- * Return true if any of the pages in the mapping are marged with the
- * passed tag.
+/**
+ * wait_for_stable_page() - wait for writeback to finish, if necessary.
+ * @page: The page to wait on.
+ *
+ * This function determines if the given page is related to a backing device
+ * that requires page contents to be held stable during writeback. If so, then
+ * it will wait for any pending writeback to complete.
*/
-int mapping_tagged(struct address_space *mapping, int tag)
+void wait_for_stable_page(struct page *page)
{
- unsigned long flags;
- int ret;
+ struct address_space *mapping = page_mapping(page);
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
- read_lock_irqsave(&mapping->tree_lock, flags);
- ret = radix_tree_tagged(&mapping->page_tree, tag);
- read_unlock_irqrestore(&mapping->tree_lock, flags);
- return ret;
+ if (!bdi_cap_stable_pages_required(bdi))
+ return;
+
+ wait_on_page_writeback(page);
}
-EXPORT_SYMBOL(mapping_tagged);
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4f59d90b81e..ef44ad736ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,14 +19,19 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
+#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
@@ -35,27 +40,128 @@
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
-
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
+#include <linux/page-isolation.h>
+#include <linux/page_cgroup.h>
+#include <linux/debugobjects.h>
+#include <linux/kmemleak.h>
+#include <linux/compaction.h>
+#include <trace/events/kmem.h>
+#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
+#include <linux/mm_inline.h>
+#include <linux/migrate.h>
+#include <linux/page-debug-flags.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/rt.h>
+
+#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
+static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
+
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
/*
- * MCD - HACK: Find somewhere to initialize this EARLY, or make this
- * initializer cleaner
+ * Array of node states.
*/
-nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
-EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
-EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+ [N_POSSIBLE] = NODE_MASK_ALL,
+ [N_ONLINE] = { { [0] = 1UL } },
+#ifndef CONFIG_NUMA
+ [N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+ [N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+#ifdef CONFIG_MOVABLE_NODE
+ [N_MEMORY] = { { [0] = 1UL } },
+#endif
+ [N_CPU] = { { [0] = 1UL } },
+#endif /* NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
+/* Protect totalram_pages and zone->managed_pages */
+static DEFINE_SPINLOCK(managed_page_count_lock);
+
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
-long nr_swap_pages;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+/*
+ * Put back the allocation mask stashed in saved_gfp_mask, if there is one,
+ * and clear the stash. Warns when pm_mutex is not held.
+ */
+void pm_restore_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+
+	/* Nothing stashed, so there is nothing to restore. */
+	if (!saved_gfp_mask)
+		return;
+
+	gfp_allowed_mask = saved_gfp_mask;
+	saved_gfp_mask = 0;
+}
+
+/*
+ * Stash the current gfp_allowed_mask in saved_gfp_mask, then drop the
+ * GFP_IOFS bits from the active mask. Warns when pm_mutex is not held or
+ * when a mask is already stashed.
+ */
+void pm_restrict_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(saved_gfp_mask);
+
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask = saved_gfp_mask & ~GFP_IOFS;
+}
+
+/*
+ * Returns false only while every GFP_IOFS bit is still set in
+ * gfp_allowed_mask; returns true as soon as any of them is masked out.
+ */
+bool pm_suspended_storage(void)
+{
+	return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+int pageblock_order __read_mostly;
+#endif
static void __free_pages_ok(struct page *page, unsigned int order);
@@ -71,71 +177,73 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* don't need any ZONE_NORMAL reservation
*/
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
+#ifdef CONFIG_ZONE_DMA
256,
+#endif
#ifdef CONFIG_ZONE_DMA32
256,
#endif
#ifdef CONFIG_HIGHMEM
- 32
+ 32,
#endif
+ 32,
};
EXPORT_SYMBOL(totalram_pages);
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
"DMA",
+#endif
#ifdef CONFIG_ZONE_DMA32
"DMA32",
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
- "HighMem"
+ "HighMem",
#endif
+ "Movable",
};
int min_free_kbytes = 1024;
+int user_min_free_kbytes = -1;
+
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata dma_reserve;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+#if MAX_NUMNODES > 1
+int nr_node_ids __read_mostly = MAX_NUMNODES;
+int nr_online_nodes __read_mostly = 1;
+EXPORT_SYMBOL(nr_node_ids);
+EXPORT_SYMBOL(nr_online_nodes);
+#endif
+
+int page_group_by_mobility_disabled __read_mostly;
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
-static unsigned long __initdata dma_reserve;
-
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
- /*
- * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct
- * ranges of memory (RAM) that may be registered with add_active_range().
- * Ranges passed to add_active_range() will be merged if possible
- * so the number of times add_active_range() can be called is
- * related to the number of nodes and the number of holes
- */
- #ifdef CONFIG_MAX_ACTIVE_REGIONS
- /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
- #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
- #else
- #if MAX_NUMNODES >= 32
- /* If there can be many nodes, allow up to 50 holes per node */
- #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
- #else
- /* By default, allow up to 256 distinct regions */
- #define MAX_ACTIVE_REGIONS 256
- #endif
- #endif
-
- struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
- int __initdata nr_nodemap_entries;
- unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
- unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
- unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+/*
+ * Store a migratetype in the PB_migrate..PB_migrate_end pageblock flag bits
+ * of @page. While page_group_by_mobility_disabled is set, every type below
+ * MIGRATE_PCPTYPES is stored as MIGRATE_UNMOVABLE instead.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
+{
+	int effective = migratetype;
+
+	if (unlikely(page_group_by_mobility_disabled &&
+		     effective < MIGRATE_PCPTYPES))
+		effective = MIGRATE_UNMOVABLE;
+
+	set_pageblock_flags_group(page, (unsigned long)effective,
+				  PB_migrate, PB_migrate_end);
+}
+
+bool oom_killer_disabled __read_mostly;
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -143,24 +251,28 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
int ret = 0;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
+ unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
- if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
- ret = 1;
- else if (pfn < zone->zone_start_pfn)
+ start_pfn = zone->zone_start_pfn;
+ sp = zone->spanned_pages;
+ if (!zone_spans_pfn(zone, pfn))
ret = 1;
} while (zone_span_seqretry(zone, seq));
+ if (ret)
+ pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
+ pfn, zone_to_nid(zone), zone->name,
+ start_pfn, start_pfn + sp);
+
return ret;
}
static int page_is_consistent(struct zone *zone, struct page *page)
{
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(page)))
+ if (!pfn_valid_within(page_to_pfn(page)))
return 0;
-#endif
if (zone != page_zone(page))
return 0;
@@ -185,30 +297,49 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, const char *reason,
+ unsigned long bad_flags)
{
- printk(KERN_EMERG "Bad page state in process '%s'\n"
- KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
- KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
- KERN_EMERG "Backtrace:\n",
- current->comm, page, (int)(2*sizeof(unsigned long)),
- (unsigned long)page->flags, page->mapping,
- page_mapcount(page), page_count(page));
+ static unsigned long resume;
+ static unsigned long nr_shown;
+ static unsigned long nr_unshown;
+
+ /* Don't complain about poisoned pages */
+ if (PageHWPoison(page)) {
+ page_mapcount_reset(page); /* remove PageBuddy */
+ return;
+ }
+
+ /*
+ * Allow a burst of 60 reports, then keep quiet for that minute;
+ * or allow a steady drip of one report per second.
+ */
+ if (nr_shown == 60) {
+ if (time_before(jiffies, resume)) {
+ nr_unshown++;
+ goto out;
+ }
+ if (nr_unshown) {
+ printk(KERN_ALERT
+ "BUG: Bad page state: %lu messages suppressed\n",
+ nr_unshown);
+ nr_unshown = 0;
+ }
+ nr_shown = 0;
+ }
+ if (nr_shown++ == 0)
+ resume = jiffies + 60 * HZ;
+
+ printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ dump_page_badflags(page, reason, bad_flags);
+
+ print_modules();
dump_stack();
- page->flags &= ~(1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_dirty |
- 1 << PG_reclaim |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_buddy );
- set_page_count(page, 0);
- reset_page_mapcount(page);
- page->mapping = NULL;
- add_taint(TAINT_BAD_PAGE);
+out:
+ /* Leave bad fields for debug, except PageBuddy could make trouble */
+ page_mapcount_reset(page); /* remove PageBuddy */
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
@@ -218,8 +349,8 @@ static void bad_page(struct page *page)
*
* The remaining PAGE_SIZE pages are called "tail pages".
*
- * All pages have PG_compound set. All pages have their ->private pointing at
- * the head page (even the head page has this).
+ * All pages have PG_compound set. All tail pages have their ->first_page
+ * pointing at the head page.
*
* The first tail page's ->lru.next holds the address of the compound page's
* put_page() function. Its ->lru.prev holds the order of allocation.
@@ -228,47 +359,62 @@ static void bad_page(struct page *page)
static void free_compound_page(struct page *page)
{
- __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+ __free_pages_ok(page, compound_order(page));
}
-static void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- page[1].lru.next = (void *)free_compound_page; /* set dtor */
- page[1].lru.prev = (void *)order;
- for (i = 0; i < nr_pages; i++) {
+ set_compound_page_dtor(page, free_compound_page);
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
-
- __SetPageCompound(p);
- set_page_private(p, (unsigned long)page);
+ set_page_count(p, 0);
+ p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
-static void destroy_compound_page(struct page *page, unsigned long order)
+/* update __split_huge_page_refcount if you change this function */
+static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
+ int bad = 0;
+
+ if (unlikely(compound_order(page) != order)) {
+ bad_page(page, "wrong compound order", 0);
+ bad++;
+ }
- if (unlikely((unsigned long)page[1].lru.prev != order))
- bad_page(page);
+ __ClearPageHead(page);
- for (i = 0; i < nr_pages; i++) {
+ for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageCompound(p) |
- (page_private(p) != (unsigned long)page)))
- bad_page(page);
- __ClearPageCompound(p);
+ if (unlikely(!PageTail(p))) {
+ bad_page(page, "PageTail not set", 0);
+ bad++;
+ } else if (unlikely(p->first_page != page)) {
+ bad_page(page, "first_page not consistent", 0);
+ bad++;
+ }
+ __ClearPageTail(p);
}
+
+ return bad;
}
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+static inline void prep_zero_page(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
{
int i;
- VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
/*
* clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
* and __GFP_HIGHMEM from hard or soft interrupt context.
@@ -278,17 +424,38 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
clear_highpage(page + i);
}
-/*
- * function for dealing with page's order in buddy system.
- * zone->lock is already acquired when we use these.
- * So, we don't need atomic page->flags operations here.
- */
-static inline unsigned long page_order(struct page *page)
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+/*
+ * Boot-time parser for "debug_guardpage_minorder=".
+ *
+ * @buf: text following the '=' on the kernel command line.
+ *
+ * Accepts a decimal value no larger than MAX_ORDER / 2 and stores it in
+ * _debug_guardpage_minorder; anything else is reported and ignored.
+ *
+ * Returns 1 in both cases: this is a __setup() handler, and a zero return
+ * would mark the option as unhandled even though it was recognised here.
+ */
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+	unsigned long res;
+
+	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+		return 1;
+	}
+	_debug_guardpage_minorder = res;
+	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+	return 1;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
{
- return page_private(page);
+ __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
}
-static inline void set_page_order(struct page *page, int order)
+/* Clear the PAGE_DEBUG_FLAG_GUARD bit in @page's debug_flags word. */
+static inline void clear_page_guard_flag(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
+static inline void set_page_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -317,18 +484,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
/*
@@ -339,24 +498,39 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE.
+ * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
+ * serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
- int order)
+ unsigned int order)
{
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(buddy)))
+ if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
-#endif
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
+ if (page_is_guard(buddy) && page_order(buddy) == order) {
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
+ return 1;
+ }
if (PageBuddy(buddy) && page_order(buddy) == order) {
- BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ /*
+ * zone check is done late to avoid uselessly
+ * calculating zone/node ids for pages that could
+ * never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
return 0;
@@ -375,84 +549,122 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
- * order is recorded in page_private(page) field.
+ * free pages of length of (1 << order) and marked with _mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
+ * field.
* So when we are allocating or freeing one, we can derive the state of the
- * other. That is, if we allocate a small block, and both were
- * free, the remainder of the region must be split into blocks.
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.
+ * triggers coalescing into a block of larger size.
*
- * -- wli
+ * -- nyc
*/
static inline void __free_one_page(struct page *page,
- struct zone *zone, unsigned int order)
+ unsigned long pfn,
+ struct zone *zone, unsigned int order,
+ int migratetype)
{
unsigned long page_idx;
- int order_size = 1 << order;
+ unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
+ struct page *buddy;
+
+ VM_BUG_ON(!zone_is_initialized(zone));
if (unlikely(PageCompound(page)))
- destroy_compound_page(page, order);
+ if (unlikely(destroy_compound_page(page, order)))
+ return;
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ VM_BUG_ON(migratetype == -1);
- VM_BUG_ON(page_idx & (order_size - 1));
- VM_BUG_ON(bad_range(zone, page));
+ /* Keep only the low MAX_ORDER bits of the pfn as the working index */
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
- zone->free_pages += order_size;
- while (order < MAX_ORDER-1) {
- unsigned long combined_idx;
- struct free_area *area;
- struct page *buddy;
+ VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
- buddy = __page_find_buddy(page, page_idx, order);
+ while (order < MAX_ORDER-1) {
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
- break; /* Move the buddy up one level. */
-
- list_del(&buddy->lru);
- area = zone->free_area + order;
- area->nr_free--;
- rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ break;
+ /*
+ * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page,
+ * merge with it and move up one order.
+ */
+ if (page_is_guard(buddy)) {
+ clear_page_guard_flag(buddy);
+ /*
+ * The guard flag and the recorded order both live on the
+ * buddy (see expand()), so the buddy is the page to reset.
+ */
+ set_page_private(buddy, 0);
+ /*
+ * expand() subtracted the guard pages from the free page
+ * state; add them back here, except for an isolated
+ * migratetype, which free_one_page() does not count either.
+ */
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, 1 << order,
+ migratetype);
+ } else {
+ list_del(&buddy->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(buddy);
+ }
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
}
set_page_order(page, order);
- list_add(&page->lru, &zone->free_area[order].free_list);
+
+ /*
+ * If this is not the largest possible page, check if the buddy
+ * of the next-highest order is free. If it is, it's possible
+ * that pages are being freed that will coalesce soon. In that
+ * case, add the free page to the tail of the list so it's less
+ * likely to be used soon and more likely to be merged into a
+ * higher order page.
+ */
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+ struct page *higher_page, *higher_buddy;
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = higher_page + (buddy_idx - combined_idx);
+ if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+ list_add_tail(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ goto out;
+ }
+ }
+
+ list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
zone->free_area[order].nr_free++;
}
static inline int free_pages_check(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (page_count(page) != 0) |
- (page->flags & (
- 1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_reclaim |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_reserved |
- 1 << PG_buddy ))))
- bad_page(page);
- if (PageDirty(page))
- __ClearPageDirty(page);
- /*
- * For now, we report if PG_reserved was found set, but do not
- * clear it, and do not free the page. But we shall soon need
- * to do more, for when the ZERO_PAGE count wraps negative.
- */
- return PageReserved(page);
+ const char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
+ return 1;
+ }
+ page_cpupid_reset_last(page);
+ if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
+ page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ return 0;
}
/*
- * Frees a list of pages.
+ * Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -462,84 +674,166 @@ static inline int free_pages_check(struct page *page)
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+ struct per_cpu_pages *pcp)
{
+ int migratetype = 0;
+ int batch_free = 0;
+ int to_free = count;
+
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- while (count--) {
+
+ while (to_free) {
struct page *page;
+ struct list_head *list;
- VM_BUG_ON(list_empty(list));
- page = list_entry(list->prev, struct page, lru);
- /* have to delete it as __free_one_page list manipulates */
- list_del(&page->lru);
- __free_one_page(page, zone, order);
+ /*
+ * Remove pages from lists in a round-robin fashion. A
+ * batch_free count is maintained that is incremented when an
+ * empty list is encountered. This is so more pages are freed
+ * off fuller lists instead of spinning excessively around empty
+ * lists
+ */
+ do {
+ batch_free++;
+ if (++migratetype == MIGRATE_PCPTYPES)
+ migratetype = 0;
+ list = &pcp->lists[migratetype];
+ } while (list_empty(list));
+
+ /* This is the only non-empty list. Free them all. */
+ if (batch_free == MIGRATE_PCPTYPES)
+ batch_free = to_free;
+
+ do {
+ int mt; /* migratetype of the to-be-freed page */
+
+ page = list_entry(list->prev, struct page, lru);
+ /* must delete as __free_one_page list manipulates */
+ list_del(&page->lru);
+ mt = get_freepage_migratetype(page);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ if (likely(!is_migrate_isolate_page(page))) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+ }
+ } while (--to_free && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
}
-static void free_one_page(struct zone *zone, struct page *page, int order)
+static void free_one_page(struct zone *zone,
+ struct page *page, unsigned long pfn,
+ unsigned int order,
+ int migratetype)
{
spin_lock(&zone->lock);
- zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __free_one_page(page, zone ,order);
+
+ __free_one_page(page, pfn, zone, order, migratetype);
+ if (unlikely(!is_migrate_isolate(migratetype)))
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
{
- unsigned long flags;
int i;
- int reserved = 0;
+ int bad = 0;
- arch_free_page(page, order);
- if (!PageHighMem(page))
+ trace_mm_page_free(page, order);
+ kmemcheck_free_shadow(page, order);
+
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page + i);
+ if (bad)
+ return false;
+
+ if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE<<order);
+ PAGE_SIZE << order);
+ debug_check_no_obj_freed(page_address(page),
+ PAGE_SIZE << order);
+ }
+ arch_free_page(page, order);
+ kernel_map_pages(page, 1 << order, 0);
- for (i = 0 ; i < (1 << order) ; ++i)
- reserved += free_pages_check(page + i);
- if (reserved)
+ return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+ unsigned long flags;
+ int migratetype;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_pages_prepare(page, order))
return;
- kernel_map_pages(page, 1 << order, 0);
+ migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, order);
+ set_freepage_migratetype(page, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
-void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __init __free_pages_bootmem(struct page *page, unsigned int order)
{
- if (order == 0) {
- __ClearPageReserved(page);
- set_page_count(page, 0);
- set_page_refcounted(page);
- __free_page(page);
- } else {
- int loop;
+ unsigned int nr_pages = 1 << order;
+ struct page *p = page;
+ unsigned int loop;
+
+ prefetchw(p);
+ for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
- prefetchw(page);
- for (loop = 0; loop < BITS_PER_LONG; loop++) {
- struct page *p = &page[loop];
+ page_zone(page)->managed_pages += nr_pages;
+ set_page_refcounted(page);
+ __free_pages(page, order);
+}
- if (loop + 1 < BITS_PER_LONG)
- prefetchw(p + 1);
- __ClearPageReserved(p);
- set_page_count(p, 0);
- }
+#ifdef CONFIG_CMA
+/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+ unsigned i = pageblock_nr_pages;
+ struct page *p = page;
+ do {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ } while (++p, --i);
+
+ set_pageblock_migratetype(page, MIGRATE_CMA);
+
+ if (pageblock_order >= MAX_ORDER) {
+ i = pageblock_nr_pages;
+ p = page;
+ do {
+ set_page_refcounted(p);
+ __free_pages(p, MAX_ORDER - 1);
+ p += MAX_ORDER_NR_PAGES;
+ } while (i -= MAX_ORDER_NR_PAGES);
+ } else {
set_page_refcounted(page);
- __free_pages(page, order);
+ __free_pages(page, pageblock_order);
}
-}
+ adjust_managed_page_count(page, pageblock_nr_pages);
+}
+#endif
/*
* The order of subdivision here is critical for the IO subsystem.
@@ -553,10 +847,11 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
- * -- wli
+ * -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area)
+ int low, int high, struct free_area *area,
+ int migratetype)
{
unsigned long size = 1 << high;
@@ -564,8 +859,26 @@ static inline void expand(struct zone *zone, struct page *page,
area--;
high--;
size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
- list_add(&page[size].lru, &area->free_list);
+ VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (high < debug_guardpage_minorder()) {
+ /*
+ * Mark as guard pages (or page); this allows them to be
+ * merged back into the allocator when the buddy is freed.
+ * The corresponding page table entries are not touched, so
+ * the pages stay not present in the virtual address space.
+ */
+ INIT_LIST_HEAD(&page[size].lru);
+ set_page_guard_flag(&page[size]);
+ set_page_private(&page[size], high);
+ /* Guard pages are not available for any usage */
+ __mod_zone_freepage_state(zone, -(1 << high),
+ migratetype);
+ continue;
+ }
+#endif
+ list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
}
@@ -574,37 +887,44 @@ static inline void expand(struct zone *zone, struct page *page,
/*
* This page is about to be returned from the page allocator
*/
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
-{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (page_count(page) != 0) |
- (page->flags & (
- 1 << PG_lru |
- 1 << PG_private |
- 1 << PG_locked |
- 1 << PG_active |
- 1 << PG_dirty |
- 1 << PG_reclaim |
- 1 << PG_slab |
- 1 << PG_swapcache |
- 1 << PG_writeback |
- 1 << PG_reserved |
- 1 << PG_buddy ))))
- bad_page(page);
-
- /*
- * For now, we report if PG_reserved was found set, but do not
- * clear it, and do not allocate the page: as a safety net.
- */
- if (PageReserved(page))
+static inline int check_new_page(struct page *page)
+{
+ const char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
+ }
+ return 0;
+}
+
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+{
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+ if (unlikely(check_new_page(p)))
+ return 1;
+ }
- page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
- 1 << PG_referenced | 1 << PG_arch_1 |
- 1 << PG_checked | 1 << PG_mappedtodisk);
set_page_private(page, 0);
set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
if (gfp_flags & __GFP_ZERO)
@@ -616,201 +936,522 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
return 0;
}
-/*
- * Do the hard work of removing an element from the buddy allocator.
- * Call me with the zone->lock already held.
+/*
+ * Go through the free lists for the given migratetype and remove
+ * the smallest available page from the freelists
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static inline
+struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+ int migratetype)
{
- struct free_area * area;
unsigned int current_order;
+ struct free_area *area;
struct page *page;
+ /* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
- if (list_empty(&area->free_list))
+ area = &(zone->free_area[current_order]);
+ if (list_empty(&area->free_list[migratetype]))
continue;
- page = list_entry(area->free_list.next, struct page, lru);
+ page = list_entry(area->free_list[migratetype].next,
+ struct page, lru);
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
- zone->free_pages -= 1UL << order;
- expand(zone, page, order, current_order, area);
+ expand(zone, page, order, current_order, area, migratetype);
+ set_freepage_migratetype(page, migratetype);
return page;
}
return NULL;
}
-/*
+
+/*
+ * This array describes the order lists are fallen back to when
+ * the free lists for the desired migrate type are depleted.
+ *
+ * Rows are indexed by the migrate type that ran dry. __rmqueue_fallback()
+ * walks a row from left to right and stops at the first MIGRATE_RESERVE
+ * entry, so MIGRATE_RESERVE also acts as the row terminator. With
+ * CONFIG_CMA the MIGRATE_MOVABLE row uses all four columns.
+ */
+static int fallbacks[MIGRATE_TYPES][4] = {
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+#ifdef CONFIG_CMA
+ [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
+#else
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+#endif
+ [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
+#endif
+};
+
+/*
+ * Move the free pages in a range to the free lists of the requested type.
+ * Note that start_page and end_page are not aligned on a pageblock
+ * boundary. If alignment is required, use move_freepages_block()
+ *
+ * The range is inclusive of end_page. Pages whose pfn is not valid and
+ * pages that are not PageBuddy are stepped over one at a time; every buddy
+ * page found is moved to the free list of its own order for migratetype
+ * and the scan then skips the whole buddy block.
+ *
+ * Returns the number of pages moved.
+ */
+int move_freepages(struct zone *zone,
+ struct page *start_page, struct page *end_page,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long order;
+ int pages_moved = 0;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+ /*
+ * page_zone is not safe to call in this context when
+ * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
+ * anyway as we check zone boundaries in move_freepages_block().
+ * Remove at a later date when no bug reports exist related to
+ * grouping pages by mobility
+ */
+ BUG_ON(page_zone(start_page) != page_zone(end_page));
+#endif
+
+ for (page = start_page; page <= end_page;) {
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
+ if (!pfn_valid_within(page_to_pfn(page))) {
+ page++;
+ continue;
+ }
+
+ if (!PageBuddy(page)) {
+ page++;
+ continue;
+ }
+
+ order = page_order(page);
+ list_move(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
+ set_freepage_migratetype(page, migratetype);
+ page += 1 << order; /* jump past the whole buddy block */
+ pages_moved += 1 << order;
+ }
+
+ return pages_moved;
+}
+
+int move_freepages_block(struct zone *zone, struct page *page,
+ int migratetype) /* returns the number of pages moved, as reported by move_freepages() */
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *start_page, *end_page;
+
+ start_pfn = page_to_pfn(page);
+ start_pfn = start_pfn & ~(pageblock_nr_pages-1); /* round down to a multiple of pageblock_nr_pages */
+ start_page = pfn_to_page(start_pfn);
+ end_page = start_page + pageblock_nr_pages - 1; /* last page of the block, inclusive */
+ end_pfn = start_pfn + pageblock_nr_pages - 1;
+
+ /*
+ * Do not cross zone boundaries: if the block starts outside the
+ * zone, begin at the page that was passed in; if it ends outside
+ * the zone, move nothing at all.
+ */
+ if (!zone_spans_pfn(zone, start_pfn))
+ start_page = page;
+ if (!zone_spans_pfn(zone, end_pfn))
+ return 0;
+
+ return move_freepages(zone, start_page, end_page, migratetype);
+}
+
+static void change_pageblock_range(struct page *pageblock_page,
+ int start_order, int migratetype) /* set migratetype on every pageblock covered by an order-start_order range */
+{
+ int nr_pageblocks = 1 << (start_order - pageblock_order); /* the caller in this file only passes start_order >= pageblock_order */
+
+ while (nr_pageblocks--) {
+ set_pageblock_migratetype(pageblock_page, migratetype);
+ pageblock_page += pageblock_nr_pages; /* advance to the first page of the next pageblock */
+ }
+}
+
+/*
+ * If breaking a large block of pages, move all free pages to the preferred
+ * allocation list. If falling back for a reclaimable kernel allocation, be
+ * more aggressive about taking ownership of free pages.
+ *
+ * On the other hand, never change migration type of MIGRATE_CMA pageblocks
+ * nor move CMA pages to different free lists. We don't want unmovable pages
+ * to be allocated from MIGRATE_CMA areas.
+ *
+ * page is the free page found on the fallback_type list, start_type is the
+ * migratetype the allocation originally asked for. page_order(page) is read
+ * here, so the page must still carry its buddy order when this is called.
+ *
+ * Returns the new migratetype of the pageblock (or the same old migratetype
+ * if it was unchanged).
+ */
+static int try_to_steal_freepages(struct zone *zone, struct page *page,
+ int start_type, int fallback_type)
+{
+ int current_order = page_order(page);
+
+ /*
+ * When borrowing from MIGRATE_CMA, we need to release the excess
+ * buddy pages to CMA itself. We also ensure the freepage_migratetype
+ * is set to CMA so it is returned to the correct freelist in case
+ * the page ends up being not actually allocated from the pcp lists.
+ */
+ if (is_migrate_cma(fallback_type))
+ return fallback_type;
+
+ /* Take ownership for orders >= pageblock_order */
+ if (current_order >= pageblock_order) {
+ change_pageblock_range(page, current_order, start_type);
+ return start_type;
+ }
+
+ if (current_order >= pageblock_order / 2 ||
+ start_type == MIGRATE_RECLAIMABLE ||
+ page_group_by_mobility_disabled) {
+ int pages;
+
+ pages = move_freepages_block(zone, page, start_type);
+
+ /*
+ * Claim the whole block if at least half of it is free.
+ * Otherwise the free pages stay moved to the start_type
+ * lists, but the pageblock keeps its type and
+ * fallback_type is returned below.
+ */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled) {
+
+ set_pageblock_migratetype(page, start_type);
+ return start_type;
+ }
+
+ }
+
+ return fallback_type;
+}
+
+/*
+ * Remove an element from the buddy allocator from the fallback list.
+ *
+ * Searches from the largest order down to the requested order and, within
+ * each order, tries the fallback types of start_migratetype in the order
+ * given by fallbacks[]. The first non-empty free list supplies the page.
+ * Returns that page, or NULL if no fallback list held a large enough block.
+ */
+static inline struct page *
+__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+{
+ struct free_area *area;
+ unsigned int current_order;
+ struct page *page;
+ int migratetype, new_type, i;
+
+ /*
+ * Find the largest possible block of pages in the other list.
+ * current_order is unsigned, so decrementing it past zero wraps
+ * around; the "<= MAX_ORDER-1" test ends the loop in that case.
+ */
+ for (current_order = MAX_ORDER-1;
+ current_order >= order && current_order <= MAX_ORDER-1;
+ --current_order) {
+ for (i = 0;; i++) { /* no bound on i: the MIGRATE_RESERVE entry below ends the row */
+ migratetype = fallbacks[start_migratetype][i];
+
+ /* MIGRATE_RESERVE handled later if necessary */
+ if (migratetype == MIGRATE_RESERVE)
+ break;
+
+ area = &(zone->free_area[current_order]);
+ if (list_empty(&area->free_list[migratetype]))
+ continue;
+
+ page = list_entry(area->free_list[migratetype].next,
+ struct page, lru);
+ area->nr_free--;
+
+ new_type = try_to_steal_freepages(zone, page,
+ start_migratetype,
+ migratetype); /* reads page_order(page), so it runs before rmv_page_order() */
+
+ /* Remove the page from the freelists */
+ list_del(&page->lru);
+ rmv_page_order(page);
+
+ expand(zone, page, order, current_order, area,
+ new_type);
+ /* The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages. This is OK as long as it does
+ * not differ for MIGRATE_CMA type.
+ */
+ set_freepage_migratetype(page, new_type);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype, new_type);
+
+ return page;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ *
+ * The free lists of the requested migratetype are tried first, then the
+ * fallback lists, and finally the MIGRATE_RESERVE lists. Returns NULL if
+ * none of the lists tried yields a page.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order,
+ int migratetype)
+{
+ struct page *page;
+
+retry_reserve:
+ page = __rmqueue_smallest(zone, order, migratetype);
+
+ if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+ page = __rmqueue_fallback(zone, order, migratetype);
+
+ /*
+ * Use MIGRATE_RESERVE rather than fail an allocation. goto
+ * is used because __rmqueue_smallest is an inline function
+ * and we want just one call site
+ */
+ if (!page) {
+ migratetype = MIGRATE_RESERVE; /* makes the condition above false on the retry, so the loop runs at most twice */
+ goto retry_reserve;
+ }
+ }
+
+ trace_mm_page_alloc_zone_locked(page, order, migratetype); /* reached on failure too, with page == NULL */
+ return page;
+}
+
+/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
-static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+ unsigned long count, struct list_head *list,
+ int migratetype, bool cold)
{
int i;
-
+
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order);
+ struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
break;
- list_add_tail(&page->lru, list);
+
+ /*
+ * Split buddy pages returned by expand() are received here
+ * in physical page order. The page is added to the caller's
+ * list and the list head then moves forward. From the caller's
+ * perspective, the linked list is ordered by page number in
+ * some conditions. This is useful for IO devices that can
+ * merge IO requests if the physical pages are ordered
+ * properly.
+ */
+ if (likely(!cold))
+ list_add(&page->lru, list);
+ else
+ list_add_tail(&page->lru, list);
+ list = &page->lru;
+ if (is_migrate_cma(get_freepage_migratetype(page)))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ -(1 << order));
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return i;
}
#ifdef CONFIG_NUMA
/*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
* Note that this function must be called with the thread pinned to
* a single processor.
*/
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- int i;
- enum zone_type z;
unsigned long flags;
+ int to_drain;
+ unsigned long batch;
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
- struct per_cpu_pageset *pset;
-
- if (!populated_zone(zone))
- continue;
-
- pset = zone_pcp(zone, smp_processor_id());
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
-
- pcp = &pset->pcp[i];
- if (pcp->count) {
- local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
- local_irq_restore(flags);
- }
- }
+ local_irq_save(flags);
+ batch = ACCESS_ONCE(pcp->batch);
+ if (pcp->count >= batch)
+ to_drain = batch;
+ else
+ to_drain = pcp->count;
+ if (to_drain > 0) {
+ free_pcppages_bulk(zone, to_drain, pcp);
+ pcp->count -= to_drain;
}
+ local_irq_restore(flags);
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
-static void __drain_pages(unsigned int cpu)
+/*
+ * Drain pages of the indicated processor.
+ *
+ * The processor must either be the current processor and the
+ * thread pinned to the current processor or a processor that
+ * is not online.
+ */
+static void drain_pages(unsigned int cpu)
{
unsigned long flags;
struct zone *zone;
- int i;
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
- pset = zone_pcp(zone, cpu);
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
+ local_irq_save(flags);
+ pset = per_cpu_ptr(zone->pageset, cpu);
- pcp = &pset->pcp[i];
- local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp = &pset->pcp;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
- local_irq_restore(flags);
}
+ local_irq_restore(flags);
}
}
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_PM
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * arg is unused; the void * parameter lets this function be passed as the
+ * callback to on_each_cpu_mask() in drain_all_pages().
+ */
+void drain_local_pages(void *arg)
+{
+ drain_pages(smp_processor_id());
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
+ */
+void drain_all_pages(void)
+{
+ int cpu;
+ struct per_cpu_pageset *pcp; /* despite the name this is a pageset; the page lists are pcp->pcp */
+ struct zone *zone;
+
+ /*
+ * Allocate in the BSS so we won't require allocation in
+ * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+ */
+ static cpumask_t cpus_with_pcps;
+
+ /*
+ * We don't care about racing with CPU hotplug event
+ * as offline notification will cause the notified
+ * cpu to drain that CPU pcps and on_each_cpu_mask
+ * disables preemption as part of its processing
+ */
+ for_each_online_cpu(cpu) {
+ bool has_pcps = false;
+ for_each_populated_zone(zone) {
+ pcp = per_cpu_ptr(zone->pageset, cpu);
+ if (pcp->pcp.count) {
+ has_pcps = true;
+ break; /* one non-empty zone is enough to include this CPU */
+ }
+ }
+ if (has_pcps)
+ cpumask_set_cpu(cpu, &cpus_with_pcps);
+ else
+ cpumask_clear_cpu(cpu, &cpus_with_pcps); /* the mask is static, so a bit left from an earlier call must be cleared */
+ }
+ on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
+}
+
+#ifdef CONFIG_HIBERNATION
void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn;
unsigned long flags;
- int order;
+ unsigned int order, t;
struct list_head *curr;
- if (!zone->spanned_pages)
+ if (zone_is_empty(zone))
return;
spin_lock_irqsave(&zone->lock, flags);
- max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
- if (!PageNosave(page))
- ClearPageNosaveFree(page);
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
}
- for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
+ for_each_migratetype_order(order, t) {
+ list_for_each(curr, &zone->free_area[order].free_list[t]) {
unsigned long i;
pfn = page_to_pfn(list_entry(curr, struct page, lru));
for (i = 0; i < (1UL << order); i++)
- SetPageNosaveFree(pfn_to_page(pfn + i));
+ swsusp_set_page_free(pfn_to_page(pfn + i));
}
-
+ }
spin_unlock_irqrestore(&zone->lock, flags);
}
-
-/*
- * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- */
-void drain_local_pages(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __drain_pages(smp_processor_id());
- local_irq_restore(flags);
-}
#endif /* CONFIG_PM */
/*
* Free a 0-order page
+ * cold == true ? free a cold page : free a hot page
*/
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+ int migratetype;
- arch_free_page(page, 0);
-
- if (PageAnon(page))
- page->mapping = NULL;
- if (free_pages_check(page))
+ if (!free_pages_prepare(page, 0))
return;
- kernel_map_pages(page, 1, 0);
-
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ migratetype = get_pfnblock_migratetype(page, pfn);
+ set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
- list_add(&page->lru, &pcp->list);
+
+ /*
+ * We only track unmovable, reclaimable and movable on pcp lists.
+ * Free ISOLATE pages back to the allocator because they are being
+ * offlined but treat RESERVE as movable pages so we can get those
+ * areas back if necessary. Otherwise, we may have to free
+ * excessively into the page allocator
+ */
+ if (migratetype >= MIGRATE_PCPTYPES) {
+ if (unlikely(is_migrate_isolate(migratetype))) {
+ free_one_page(zone, page, pfn, 0, migratetype);
+ goto out;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ if (!cold)
+ list_add(&page->lru, &pcp->lists[migratetype]);
+ else
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
- pcp->count -= pcp->batch;
+ unsigned long batch = ACCESS_ONCE(pcp->batch);
+ free_pcppages_bulk(zone, batch, pcp);
+ pcp->count -= batch;
}
+
+out:
local_irq_restore(flags);
- put_cpu();
}
-void fastcall free_hot_page(struct page *page)
-{
- free_hot_cold_page(page, 0);
-}
-
-void fastcall free_cold_page(struct page *page)
+/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, bool cold)
{
- free_hot_cold_page(page, 1);
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ trace_mm_page_free_batched(page, cold);
+ free_hot_cold_page(page, cold);
+ }
}
/*
@@ -825,91 +1466,269 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON(PageCompound(page));
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * Split shadow pages too, because free(page[0]) would
+ * otherwise free the whole shadow.
+ */
+ if (kmemcheck_page_is_tracked(page))
+ split_page(virt_to_page(page[0].shadow), order);
+#endif
+
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
}
+EXPORT_SYMBOL_GPL(split_page);
+
+static int __isolate_free_page(struct page *page, unsigned int order) /* returns the number of pages isolated (1 << order), or 0 if the watermark check fails */
+{
+ unsigned long watermark;
+ struct zone *zone;
+ int mt;
+
+ BUG_ON(!PageBuddy(page));
+
+ zone = page_zone(page);
+ mt = get_pageblock_migratetype(page);
+
+ if (!is_migrate_isolate(mt)) {
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+ }
+
+ /* Remove page from free list */
+ list_del(&page->lru);
+ zone->free_area[order].nr_free--;
+ rmv_page_order(page);
+
+ /*
+ * Set the pageblock if the isolated page is at least a pageblock.
+ * NOTE(review): the test below is "order >= pageblock_order - 1", so
+ * it also holds for a page of half a pageblock; confirm which of the
+ * comment and the code is intended.
+ */
+ if (order >= pageblock_order - 1) {
+ struct page *endpage = page + (1 << order) - 1;
+ for (; page < endpage; page += pageblock_nr_pages) {
+ int mt = get_pageblock_migratetype(page); /* shadows the function-scope mt */
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ set_pageblock_migratetype(page,
+ MIGRATE_MOVABLE);
+ }
+ }
+
+ return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_pages() after interrupts
+ * are enabled.
+ *
+ * Returns the number of pages the free page was split into, or 0 if
+ * __isolate_free_page() refused to take the page off the free list.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ int nr_pages;
+
+ order = page_order(page);
+
+ nr_pages = __isolate_free_page(page, order);
+ if (!nr_pages)
+ return 0;
+
+ /* Split into individual pages */
+ set_page_refcounted(page); /* split_page() only sets the refcount on pages 1 .. (1 << order) - 1 */
+ split_page(page, order);
+ return nr_pages;
+}
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *buffered_rmqueue(struct zonelist *zonelist,
- struct zone *zone, int order, gfp_t gfp_flags)
+static inline
+struct page *buffered_rmqueue(struct zone *preferred_zone,
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
- int cpu;
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
again:
- cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
+ struct list_head *list;
- pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
- if (!pcp->count) {
+ pcp = &this_cpu_ptr(zone->pageset)->pcp;
+ list = &pcp->lists[migratetype];
+ if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
- if (unlikely(!pcp->count))
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
goto failed;
}
- page = list_entry(pcp->list.next, struct page, lru);
+
+ if (cold)
+ page = list_entry(list->prev, struct page, lru);
+ else
+ page = list_entry(list->next, struct page, lru);
+
list_del(&page->lru);
pcp->count--;
} else {
+ if (unlikely(gfp_flags & __GFP_NOFAIL)) {
+ /*
+ * __GFP_NOFAIL is not to be used in new code.
+ *
+ * All __GFP_NOFAIL callers should be fixed so that they
+ * properly detect and handle allocation failures.
+ *
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with
+ * __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE(order > 1);
+ }
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_freepage_migratetype(page));
}
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(zonelist, zone);
+ zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
- put_cpu();
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
failed:
local_irq_restore(flags);
- put_cpu();
return NULL;
}
-#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct {
+ struct fault_attr attr;
+
+ u32 ignore_gfp_highmem; /* nonzero: should_fail_alloc_page() never fails __GFP_HIGHMEM allocations */
+ u32 ignore_gfp_wait; /* nonzero: should_fail_alloc_page() never fails __GFP_WAIT allocations */
+ u32 min_order; /* allocations of a lower order than this are never failed */
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .ignore_gfp_highmem = 1,
+ .min_order = 1, /* by default order-0 allocations are exempt */
+};
+
+static int __init setup_fail_page_alloc(char *str) /* handler for the "fail_page_alloc=" parameter registered below */
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str); /* str is handed on unparsed; the result is returned as is */
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) /* true if this allocation should be made to fail by fault injection */
+{
+ if (order < fail_page_alloc.min_order) /* the exemptions below are checked before should_fail() is consulted */
+ return false;
+ if (gfp_mask & __GFP_NOFAIL)
+ return false;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return false;
+ if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ return false;
+
+ return should_fail(&fail_page_alloc.attr, 1 << order); /* second argument is the size of the request in pages */
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void) /* creates the "fail_page_alloc" debugfs directory and its three control files */
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
+ &fail_page_alloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem))
+ goto fail;
+ if (!debugfs_create_u32("min-order", mode, dir,
+ &fail_page_alloc.min_order))
+ goto fail;
+
+ return 0;
+fail:
+ debugfs_remove_recursive(dir); /* drop the directory and any files already created in it */
+
+ return -ENOMEM; /* reported for any file-creation failure */
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return false; /* !CONFIG_FAIL_PAGE_ALLOC stub: never inject a failure */
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags,
+ long free_pages)
{
/* free_pages may go negative - that's OK */
- long min = mark, free_pages = z->free_pages - (1 << order) + 1;
+ long min = mark;
+ long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
+ long free_cma = 0;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ if (free_pages - free_cma <= min + lowmem_reserve)
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -918,263 +1737,1115 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
+ }
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, int alloc_flags) /* watermark test using the zone's NR_FREE_PAGES counter as the free-page count */
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags) /* like zone_watermark_ok(), but may re-read the free-page count first */
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) /* only when a drift mark is set and the counter reads below it */
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); /* presumably a more accurate reading - confirm in vmstat */
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full. See further
+ * comments in mmzone.h. Reduces cache footprint of zonelist scans
+ * that have to skip over a lot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_states[N_MEMORY].) The task's
+ * mems_allowed is only chosen when not in interrupt context and
+ * ALLOC_CPUSET is set in alloc_flags.
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ nodemask_t *allowednodes; /* zonelist_cache approximation */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return NULL;
+
+ if (time_after(jiffies, zlc->last_full_zap + HZ)) {
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ zlc->last_full_zap = jiffies;
}
+
+ allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+ &cpuset_current_mems_allowed :
+ &node_states[N_MEMORY];
+ return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not. A zonelist without a
+ * zonelist cache always yields true.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
+ nodemask_t *allowednodes)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+ int n; /* node that zone *z is on */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return 1;
+
+ i = z - zonelist->_zonerefs;
+ n = zlc->z_to_n[i];
+
+ /* This zone is worth trying if it is allowed but not full */
+ return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ *
+ * Does nothing if the zonelist has no zonelist cache.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ i = z - zonelist->_zonerefs;
+
+ set_bit(i, zlc->fullzones);
+}
+
+/*
+ * Clear the "full" mark of every zone. Called after direct reclaim makes
+ * progress so that a zone that was recently full is not skipped over for
+ * up to a second. Does nothing if the zonelist has no zonelist cache.
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+ return local_zone->node == zone->node; /* true when both zones have the same node id */
+}
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+ RECLAIM_DISTANCE; /* true when the two zones' nodes are closer than RECLAIM_DISTANCE */
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL; /* !CONFIG_NUMA stub: no allowed-node mask to hand back */
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
+ nodemask_t *allowednodes)
+{
return 1; /* !CONFIG_NUMA stub: every zone is worth trying */
}
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
+{ /* !CONFIG_NUMA stub: nothing to record */
+}
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{ /* !CONFIG_NUMA stub: nothing to clear */
+}
+
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+ return true; /* !CONFIG_NUMA stub: every zone counts as local */
+}
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return true; /* !CONFIG_NUMA stub: reclaim is always allowed */
+}
+
+#endif /* CONFIG_NUMA */
+
/*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, int alloc_flags)
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+ struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
+ struct zone *preferred_zone, int classzone_idx, int migratetype)
{
- struct zone **z = zonelist->zones;
+ struct zoneref *z;
struct page *page = NULL;
- int classzone_idx = zone_idx(*z);
struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE);
+zonelist_scan:
/*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Scan zonelist, looking for a zone with enough free.
+ * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
*/
- do {
- zone = *z;
- if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
- zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
- break;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask) {
+ unsigned long mark;
+
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ continue;
+ /*
+ * Distribute pages in proportion to the individual
+ * zone size to ensure fair page aging. The zone a
+ * page was allocated in should have no effect on the
+ * time the page has in memory before being reclaimed.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ if (!zone_local(preferred_zone, zone))
+ continue;
+ if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+ continue;
+ }
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if (consider_zone_dirty && !zone_dirty_ok(zone))
continue;
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- if (alloc_flags & ALLOC_WMARK_MIN)
- mark = zone->pages_min;
- else if (alloc_flags & ALLOC_WMARK_LOW)
- mark = zone->pages_low;
- else
- mark = zone->pages_high;
- if (!zone_watermark_ok(zone , order, mark,
- classzone_idx, alloc_flags))
- if (!zone_reclaim_mode ||
- !zone_reclaim(zone, gfp_mask, order))
- continue;
+ mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
+ int ret;
+
+ /* Checked here to keep the fast path fast */
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ goto try_this_zone;
+
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+ * we do zlc_setup if there are multiple nodes
+ * and before considering the first zone allowed
+ * by the cpuset.
+ */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+
+ if (zone_reclaim_mode == 0 ||
+ !zone_allows_reclaim(preferred_zone, zone))
+ goto this_zone_full;
+
+ /*
+ * As we may have just activated ZLC, check if the first
+ * eligible zone has failed zone_reclaim recently.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+
+ ret = zone_reclaim(zone, gfp_mask, order);
+ switch (ret) {
+ case ZONE_RECLAIM_NOSCAN:
+ /* did not scan */
+ continue;
+ case ZONE_RECLAIM_FULL:
+ /* scanned but unreclaimable */
+ continue;
+ default:
+ /* did we reclaim enough */
+ if (zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags))
+ goto try_this_zone;
+
+ /*
+ * Failed to reclaim enough to meet watermark.
+ * Only mark the zone full if checking the min
+ * watermark or if we failed to reclaim just
+ * 1<<order pages or else the page allocator
+ * fastpath will prematurely mark zones full
+ * when the watermark is between the low and
+ * min watermarks.
+ */
+ if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
+ ret == ZONE_RECLAIM_SOME)
+ goto this_zone_full;
+
+ continue;
+ }
}
- page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
- if (page) {
+try_this_zone:
+ page = buffered_rmqueue(preferred_zone, zone, order,
+ gfp_mask, migratetype);
+ if (page)
break;
- }
- } while (*(++z) != NULL);
+this_zone_full:
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
+ zlc_mark_zone_full(zonelist, z);
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
+
+ if (page)
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ * necessary to allocate the page. The expectation is
+ * that the caller is taking steps that will free more
+ * memory. The caller should avoid the page being used
+ * for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+
return page;
}
/*
- * This is the 'heart' of the zoned buddy allocator.
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
*/
-struct page * fastcall
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
+static inline bool should_suppress_show_mem(void)
+{
+ bool ret = false;
+
+#if NODES_SHIFT > 8
+ ret = in_interrupt();
+#endif
+ return ret;
+}
+
+static DEFINE_RATELIMIT_STATE(nopage_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST); /* consulted through __ratelimit() by warn_alloc_failed() */
+
+void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+{ /*
+ * Log a page allocation failure for the current task: an optional
+ * caller-supplied printf-style message (skipped when fmt is NULL), then
+ * the task name, order and gfp mode, a stack dump and, unless
+ * should_suppress_show_mem() returns true, show_mem(filter).
+ */
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+ debug_guardpage_minorder() > 0)
+ return; /* stay silent: __GFP_NOWARN, rate limited, or debug_guardpage_minorder() > 0 */
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ *
+ * In each case below the SHOW_MEM_FILTER_NODES bit is dropped from
+ * the filter that is later handed to show_mem().
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (test_thread_flag(TIF_MEMDIE) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) /* separate statement, not nested under the test above */
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ if (fmt) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("%pV", &vaf); /* %pV prints the caller's format with its varargs */
+
+ va_end(args);
+ }
+
+ pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
+
+ dump_stack();
+ if (!should_suppress_show_mem())
+ show_mem(filter);
+}
+
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+ unsigned long did_some_progress,
+ unsigned long pages_reclaimed)
+{ /*
+ * Decide whether a failed allocation attempt should be repeated.
+ * Returns 1 to retry and 0 to give up. The checks run in the order
+ * written and the first one that matches decides the result.
+ */
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ return 0;
+
+ /* Always retry if specifically requested */
+ if (gfp_mask & __GFP_NOFAIL)
+ return 1;
+
+ /*
+ * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+ * making forward progress without invoking OOM. Suspend also disables
+ * storage devices so kswapd will not help. Bail if we are suspending.
+ */
+ if (!did_some_progress && pm_suspended_storage())
+ return 0;
+
+ /*
+ * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+ * means __GFP_NOFAIL, but that may not be true in other
+ * implementations.
+ */
+ if (order <= PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
+
+ /*
+ * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+ * specified, then we retry until we no longer reclaim any pages
+ * (above), or we've reclaimed an order of pages at least as
+ * large as the allocation's order. In both cases, if the
+ * allocation still fails, we stop retrying.
+ */
+ if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) /* & binds tighter than &&: (mask & REPEAT) && (reclaimed < 2^order) */
+ return 1;
+
+ return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int classzone_idx, int migratetype)
{
- const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct zone **z;
struct page *page;
+
+ /* Acquire the OOM killer lock for the zones in zonelist */
+ if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+ schedule_timeout_uninterruptible(1);
+ return NULL;
+ }
+
+ /*
+ * Go through the zonelist yet one more time, keep very high watermark
+ * here, this is only to catch a parallel oom killing, we must fail if
+ * we're still under heavy pressure.
+ */
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+ order, zonelist, high_zoneidx,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET,
+ preferred_zone, classzone_idx, migratetype);
+ if (page)
+ goto out;
+
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto out;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto out;
+ /*
+ * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+ * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+ * The caller should handle page allocation failure by itself if
+ * it specifies __GFP_THISNODE.
+ * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+ */
+ if (gfp_mask & __GFP_THISNODE)
+ goto out;
+ }
+ /* Exhausted what can be done so it's blamo time */
+ out_of_memory(zonelist, gfp_mask, order, nodemask, false);
+
+out:
+ clear_zonelist_oom(zonelist, gfp_mask);
+ return page;
+}
+
+#ifdef CONFIG_COMPACTION
+/*
+ * Try memory compaction for high-order allocations before reclaim.
+ *
+ * Returns the allocated page, or NULL when order is 0, when
+ * compaction_deferred() reports true for preferred_zone (in which case
+ * *deferred_compaction is set to true), or when no page could be taken
+ * from the free lists afterwards. The return value of
+ * try_to_compact_pages() is stored in *did_some_progress.
+ */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int classzone_idx, int migratetype, enum migrate_mode mode,
+ bool *contended_compaction, bool *deferred_compaction,
+ unsigned long *did_some_progress)
+{
+ if (!order)
+ return NULL; /* order-0 requests are never compacted for */
+
+ if (compaction_deferred(preferred_zone, order)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
+ current->flags |= PF_MEMALLOC;
+ *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+ nodemask, mode,
+ contended_compaction);
+ current->flags &= ~PF_MEMALLOC; /* NOTE(review): cleared unconditionally; presumably never entered with PF_MEMALLOC already set -- confirm against callers */
+
+ if (*did_some_progress != COMPACT_SKIPPED) {
+ struct page *page;
+
+ /* Page migration frees to the PCP lists but we want merging */
+ drain_pages(get_cpu());
+ put_cpu();
+
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, /* this attempt always honours watermarks */
+ preferred_zone, classzone_idx, migratetype);
+ if (page) {
+ preferred_zone->compact_blockskip_flush = false;
+ compaction_defer_reset(preferred_zone, order, true);
+ count_vm_event(COMPACTSUCCESS);
+ return page;
+ }
+
+ /*
+ * It's bad if a compaction run occurs and fails.
+ * The most likely reason is that pages exist,
+ * but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (mode != MIGRATE_ASYNC)
+ defer_compaction(preferred_zone, order);
+
+ cond_resched();
+ }
+
+ return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int classzone_idx, int migratetype,
+ enum migrate_mode mode, bool *contended_compaction,
+ bool *deferred_compaction, unsigned long *did_some_progress)
+{
+ return NULL; /* !CONFIG_COMPACTION stub: never yields a page and leaves all output parameters untouched */
+}
+#endif /* CONFIG_COMPACTION */
+
+/* Perform direct synchronous page reclaim */
+static int
+__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
- int do_retry;
- int alloc_flags;
- int did_some_progress;
+ int progress;
- might_sleep_if(wait);
+ cond_resched();
-restart:
- z = zonelist->zones; /* the list of zones suitable for gfp_mask */
+ /* We now go into synchronous reclaim */
+ cpuset_memory_pressure_bump();
+ current->flags |= PF_MEMALLOC;
+ lockdep_set_current_reclaim_state(gfp_mask);
+ reclaim_state.reclaimed_slab = 0;
+ current->reclaim_state = &reclaim_state;
+
+ progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+ current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+ current->flags &= ~PF_MEMALLOC;
- if (unlikely(*z == NULL)) {
- /* Should this ever happen?? */
+ cond_resched();
+
+ return progress;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
+{
+ struct page *page = NULL;
+ bool drained = false;
+
+ *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
+ nodemask);
+ if (unlikely(!(*did_some_progress)))
return NULL;
+
+ /* After successful reclaim, reconsider all zones for allocation */
+ if (IS_ENABLED(CONFIG_NUMA))
+ zlc_clear_zones_full(zonelist);
+
+retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, classzone_idx,
+ migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists. Drain them and try again
+ */
+ if (!page && !drained) {
+ drain_all_pages();
+ drained = true;
+ goto retry;
}
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
- if (page)
- goto got_pg;
+ return page;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int classzone_idx, int migratetype)
+{
+ struct page *page;
do {
- wakeup_kswapd(*z, order);
- } while (*(++z));
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
+ zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
+ preferred_zone, classzone_idx, migratetype);
+
+ if (!page && gfp_mask & __GFP_NOFAIL)
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+ } while (!page && (gfp_mask & __GFP_NOFAIL));
+
+ return page;
+}
+
+static void reset_alloc_batches(struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{ /* Recompute NR_ALLOC_BATCH for the zonelist zones that are local to preferred_zone */
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ /*
+ * Only reset the batches of zones that were actually
+ * considered in the fairness pass, we don't want to
+ * trash fairness information for zones that are not
+ * actually part of this zonelist's round-robin cycle.
+ */
+ if (!zone_local(preferred_zone, zone))
+ continue;
+ mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); /* delta = (high - low) - current value; presumably applied additively -- confirm */
+ }
+}
+
+static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{ /* Call wakeup_kswapd() for every zone that for_each_zone_zonelist() yields for high_zoneidx */
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone)); /* third argument is the preferred zone's index, not this zone's */
+}
+
+static inline int
+gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+ int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
+
+ /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
- * OK, we're below the kswapd watermark and have kicked background
- * reclaim. Now things get more complex, so set up alloc_flags according
- * to how we want to proceed.
- *
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags = ALLOC_WMARK_MIN;
- if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
+
+ if (atomic) {
+ /*
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
+ /*
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
+ */
+ alloc_flags &= ~ALLOC_CPUSET;
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (gfp_mask & __GFP_HIGH)
- alloc_flags |= ALLOC_HIGH;
- if (wait)
- alloc_flags |= ALLOC_CPUSET;
+
+ if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+ if (gfp_mask & __GFP_MEMALLOC)
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ }
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
+ return alloc_flags;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); /* true iff the derived alloc flags include ALLOC_NO_WATERMARKS */
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, enum zone_type high_zoneidx,
+ nodemask_t *nodemask, struct zone *preferred_zone,
+ int classzone_idx, int migratetype)
+{
+ const gfp_t wait = gfp_mask & __GFP_WAIT;
+ struct page *page = NULL;
+ int alloc_flags;
+ unsigned long pages_reclaimed = 0;
+ unsigned long did_some_progress;
+ enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ bool deferred_compaction = false;
+ bool contended_compaction = false;
/*
- * Go through the zonelist again. Let __GFP_HIGH and allocations
- * coming from realtime tasks go deeper into reserves.
- *
- * This is the last chance, in general, before the goto nopage.
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * In the slowpath, we sanity check order to avoid ever trying to
+ * reclaim >= MAX_ORDER areas which will never succeed. Callers may
+ * be using allocators in order of preference for an area that is
+ * too large.
+ */
+ if (order >= MAX_ORDER) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
+ return NULL;
+ }
+
+ /*
+ * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+ * __GFP_NOWARN set) should not cause reclaim since the subsystem
+ * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
+ * using a larger set of nodes after it has established that the
+ * allowed per node queues are empty and that nodes are
+ * over allocated.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ goto nopage;
+
+restart:
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+
+ /*
+ * OK, we're below the kswapd watermark and have kicked background
+ * reclaim. Now things get more complex, so set up alloc_flags according
+ * to how we want to proceed.
*/
- page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
+ /*
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ struct zoneref *preferred_zoneref;
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ NULL, &preferred_zone);
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ }
+
+rebalance:
+ /* This is the last chance, in general, before the goto nopage. */
+ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+ high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
- /* This allocation should allow future memory freeing. */
+ /* Allocate without watermarks if the context allows */
+ if (alloc_flags & ALLOC_NO_WATERMARKS) {
+ /*
+ * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+ * the allocation is high priority and these types of
+ * allocations are system rather than user oriented
+ */
+ zonelist = node_zonelist(numa_node_id(), gfp_mask);
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
- && !in_interrupt()) {
- if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
- /* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, ALLOC_NO_WATERMARKS);
- if (page)
- goto got_pg;
- if (gfp_mask & __GFP_NOFAIL) {
- blk_congestion_wait(WRITE, HZ/50);
- goto nofail_alloc;
- }
+ page = __alloc_pages_high_priority(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask,
+ preferred_zone, classzone_idx, migratetype);
+ if (page) {
+ goto got_pg;
}
- goto nopage;
}
/* Atomic allocations - we can't balance anything */
- if (!wait)
+ if (!wait) {
+ /*
+ * All existing users of the deprecated __GFP_NOFAIL are
+ * blockable, so warn of any new users that actually allow this
+ * type of allocation to fail.
+ */
+ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
+ }
-rebalance:
- cond_resched();
-
- /* We now go into synchronous reclaim */
- cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ /* Avoid recursion of direct reclaim */
+ if (current->flags & PF_MEMALLOC)
+ goto nopage;
- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+ goto nopage;
- p->reclaim_state = NULL;
- p->flags &= ~PF_MEMALLOC;
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
+ &deferred_compaction,
+ &did_some_progress);
+ if (page)
+ goto got_pg;
- cond_resched();
+ /*
+ * It can become very expensive to allocate transparent hugepages at
+ * fault, so use asynchronous memory compaction for THP unless it is
+ * khugepaged trying to collapse.
+ */
+ if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+ migration_mode = MIGRATE_SYNC_LIGHT;
- if (likely(did_some_progress)) {
- page = get_page_from_freelist(gfp_mask, order,
- zonelist, alloc_flags);
- if (page)
- goto got_pg;
- } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- /*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
- */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
- if (page)
- goto got_pg;
+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. If this is the case and the caller
+ * requested a movable allocation that does not heavily disrupt the
+ * system then fail the allocation instead of entering direct reclaim.
+ */
+ if ((deferred_compaction || contended_compaction) &&
+ (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
- out_of_memory(zonelist, gfp_mask, order);
- goto restart;
- }
+ /* Try direct reclaim and then allocating */
+ page = __alloc_pages_direct_reclaim(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ classzone_idx, migratetype,
+ &did_some_progress);
+ if (page)
+ goto got_pg;
/*
- * Don't let big-order allocations loop unless the caller explicitly
- * requests that. Wait for some write requests to complete then retry.
- *
- * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
- * <= 3, but that may not be true in other implementations.
+ * If we failed to make any progress reclaiming, then we are
+ * running out of options and have to consider going OOM
*/
- do_retry = 0;
- if (!(gfp_mask & __GFP_NORETRY)) {
- if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
- do_retry = 1;
- if (gfp_mask & __GFP_NOFAIL)
- do_retry = 1;
- }
- if (do_retry) {
- blk_congestion_wait(WRITE, HZ/50);
+ if (!did_some_progress) {
+ if (oom_gfp_allowed(gfp_mask)) {
+ if (oom_killer_disabled)
+ goto nopage;
+ /* Coredumps can quickly deplete all memory reserves */
+ if ((current->flags & PF_DUMPCORE) &&
+ !(gfp_mask & __GFP_NOFAIL))
+ goto nopage;
+ page = __alloc_pages_may_oom(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask, preferred_zone,
+ classzone_idx, migratetype);
+ if (page)
+ goto got_pg;
+
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ /*
+ * The oom killer is not called for high-order
+ * allocations that may fail, so if no progress
+ * is being made, there are no other options and
+ * retrying is unlikely to help.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ goto nopage;
+ /*
+ * The oom killer is not called for lowmem
+ * allocations to prevent needlessly killing
+ * innocent tasks.
+ */
+ if (high_zoneidx < ZONE_NORMAL)
+ goto nopage;
+ }
+
+ goto restart;
+ }
+ }
+
+ /* Check if we should retry the allocation */
+ pages_reclaimed += did_some_progress;
+ if (should_alloc_retry(gfp_mask, order, did_some_progress,
+ pages_reclaimed)) {
+ /* Wait for some write requests to complete then retry */
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
+ &deferred_compaction,
+ &did_some_progress);
+ if (page)
+ goto got_pg;
}
nopage:
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- printk(KERN_WARNING "%s: page allocation failure."
- " order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
- dump_stack();
- show_mem();
- }
+ warn_alloc_failed(gfp_mask, order, NULL);
+ return page;
got_pg:
+ if (kmemcheck_enabled)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
return page;
}
-EXPORT_SYMBOL(__alloc_pages);
-
/*
- * Common helper functions.
+ * This is the 'heart' of the zoned buddy allocator.
*/
-fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, nodemask_t *nodemask)
{
- struct page * page;
- page = alloc_pages(gfp_mask, order);
- if (!page)
- return 0;
- return (unsigned long) page_address(page);
-}
+ enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ struct zone *preferred_zone;
+ struct zoneref *preferred_zoneref;
+ struct page *page = NULL;
+ int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ int classzone_idx;
-EXPORT_SYMBOL(__get_free_pages);
+ gfp_mask &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(gfp_mask);
+
+ might_sleep_if(gfp_mask & __GFP_WAIT);
+
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
+ /*
+ * Check the zones suitable for the gfp_mask contain at least one
+ * valid zone. It's possible to have an empty zonelist as a result
+ * of GFP_THISNODE and a memoryless node
+ */
+ if (unlikely(!zonelist->_zonerefs->zone))
+ return NULL;
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
+ /* The preferred zone is used for statistics later */
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
+ if (!preferred_zone)
+ goto out;
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
+retry:
+ /* First allocation attempt */
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+ zonelist, high_zoneidx, alloc_flags,
+ preferred_zone, classzone_idx, migratetype);
+ if (unlikely(!page)) {
+ /*
+ * The first pass makes sure allocations are spread
+ * fairly within the local node. However, the local
+ * node might have free pages left after the fairness
+ * batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without
+ * fairness, and include remote zones now, before
+ * entering the slowpath and waking kswapd: prefer
+ * spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ reset_alloc_batches(zonelist, high_zoneidx,
+ preferred_zone);
+ alloc_flags &= ~ALLOC_FAIR;
+ goto retry;
+ }
+ /*
+ * Runtime PM, block IO and its error handling path
+ * can deadlock because I/O on the device might not
+ * complete.
+ */
+ gfp_mask = memalloc_noio_flags(gfp_mask);
+ page = __alloc_pages_slowpath(gfp_mask, order,
+ zonelist, high_zoneidx, nodemask,
+ preferred_zone, classzone_idx, migratetype);
+ }
+
+ trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
-fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
+ return page;
+}
+EXPORT_SYMBOL(__alloc_pages_nodemask);
+
+/*
+ * Common helper functions.
+ */
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
- struct page * page;
+ struct page *page;
/*
- * get_zeroed_page() returns a 32-bit address, which cannot represent
+ * __get_free_pages() returns a 32-bit address, which cannot represent
* a highmem page
*/
VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
- page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
- if (page)
- return (unsigned long) page_address(page);
- return 0;
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
+ return 0;
+ return (unsigned long) page_address(page);
}
+EXPORT_SYMBOL(__get_free_pages);
-EXPORT_SYMBOL(get_zeroed_page);
-
-void __pagevec_free(struct pagevec *pvec)
+unsigned long get_zeroed_page(gfp_t gfp_mask)
{
- int i = pagevec_count(pvec);
-
- while (--i >= 0)
- free_hot_cold_page(pvec->pages[i], pvec->cold);
+ return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
+EXPORT_SYMBOL(get_zeroed_page);
-fastcall void __free_pages(struct page *page, unsigned int order)
+void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_page(page);
+ free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
@@ -1182,7 +2853,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
EXPORT_SYMBOL(__free_pages);
-fastcall void free_pages(unsigned long addr, unsigned int order)
+void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
@@ -1193,47 +2864,155 @@ fastcall void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
- * Total amount of free (allocatable) RAM:
+ * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
+ * of the current memory cgroup.
+ *
+ * It should be used when the caller would like to use kmalloc, but since the
+ * allocation is large, it has to fall back to the page allocator.
*/
-unsigned int nr_free_pages(void)
+struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
{
- unsigned int sum = 0;
- struct zone *zone;
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
- for_each_zone(zone)
- sum += zone->free_pages;
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages(gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
+}
- return sum;
+struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages_node(nid, gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
}
-EXPORT_SYMBOL(nr_free_pages);
+/*
+ * __free_kmem_pages and free_kmem_pages will free pages allocated with
+ * alloc_kmem_pages.
+ */
+void __free_kmem_pages(struct page *page, unsigned int order)
+{
+ memcg_kmem_uncharge_pages(page, order);
+ __free_pages(page, order);
+}
-#ifdef CONFIG_NUMA
-unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+void free_kmem_pages(unsigned long addr, unsigned int order)
{
- unsigned int sum = 0;
- enum zone_type i;
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __free_kmem_pages(virt_to_page((void *)addr), order);
+ }
+}
- for (i = 0; i < MAX_NR_ZONES; i++)
- sum += pgdat->node_zones[i].free_pages;
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page((void *)addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ return (void *)addr;
+}
- return sum;
+/**
+ * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * This function is similar to alloc_pages(), except that it allocates the
+ * minimum number of pages to satisfy the request. alloc_pages() can only
+ * allocate memory in power-of-two pages.
+ *
+ * This function is also limited by MAX_ORDER.
+ *
+ * Memory allocated by this function must be released by free_pages_exact().
+ */
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+{
+ unsigned int order = get_order(size);
+ unsigned long addr;
+
+ addr = __get_free_pages(gfp_mask, order);
+ return make_alloc_exact(addr, order, size);
}
-#endif
+EXPORT_SYMBOL(alloc_pages_exact);
-static unsigned int nr_free_zone_pages(int offset)
+/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ * pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
- /* Just pick one node, since fallback list is circular */
- pg_data_t *pgdat = NODE_DATA(numa_node_id());
- unsigned int sum = 0;
+ unsigned order = get_order(size);
+ struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ if (!p)
+ return NULL;
+ return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
+ * free_pages_exact - release memory allocated via alloc_pages_exact()
+ * @virt: the value returned by alloc_pages_exact.
+ * @size: size of allocation, same value as passed to alloc_pages_exact().
+ *
+ * Release the memory allocated by a previous call to alloc_pages_exact.
+ */
+void free_pages_exact(void *virt, size_t size)
+{
+ unsigned long addr = (unsigned long)virt;
+ unsigned long end = addr + PAGE_ALIGN(size);
- struct zonelist *zonelist = pgdat->node_zonelists + offset;
- struct zone **zonep = zonelist->zones;
+ while (addr < end) {
+ free_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
+EXPORT_SYMBOL(free_pages_exact);
+
+/**
+ * nr_free_zone_pages - count number of pages beyond high watermark
+ * @offset: The zone index of the highest zone
+ *
+ * nr_free_zone_pages() counts the number of pages which are beyond the
+ * high watermark within all zones at or below a given zone index. For each
+ * zone, the number of pages is calculated as:
+ * managed_pages - high_pages
+ */
+static unsigned long nr_free_zone_pages(int offset)
+{
+ struct zoneref *z;
struct zone *zone;
- for (zone = *zonep++; zone; zone = *zonep++) {
- unsigned long size = zone->present_pages;
- unsigned long high = zone->pages_high;
+ /* Just pick one node, since fallback list is circular */
+ unsigned long sum = 0;
+
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
+
+ for_each_zone_zonelist(zone, z, zonelist, offset) {
+ unsigned long size = zone->managed_pages;
+ unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
}
@@ -1241,33 +3020,40 @@ static unsigned int nr_free_zone_pages(int offset)
return sum;
}
-/*
- * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+/**
+ * nr_free_buffer_pages - count number of pages beyond high watermark
+ *
+ * nr_free_buffer_pages() counts the number of pages which are beyond the high
+ * watermark within ZONE_DMA and ZONE_NORMAL.
*/
-unsigned int nr_free_buffer_pages(void)
+unsigned long nr_free_buffer_pages(void)
{
return nr_free_zone_pages(gfp_zone(GFP_USER));
}
+EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-/*
- * Amount of free RAM allocatable within all zones
+/**
+ * nr_free_pagecache_pages - count number of pages beyond high watermark
+ *
+ * nr_free_pagecache_pages() counts the number of pages which are beyond the
+ * high watermark within all zones.
*/
-unsigned int nr_free_pagecache_pages(void)
+unsigned long nr_free_pagecache_pages(void)
{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+ return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}
static inline void show_node(struct zone *zone)
{
- if (NUMA_BUILD)
- printk("Node %ld ", zone_to_nid(zone));
+ if (IS_ENABLED(CONFIG_NUMA))
+ printk("Node %d ", zone_to_nid(zone));
}
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = 0;
- val->freeram = nr_free_pages();
+ val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -1279,13 +3065,18 @@ EXPORT_SYMBOL(si_meminfo);
#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
+ int zone_type; /* needs to be signed */
+ unsigned long managed_pages = 0;
pg_data_t *pgdat = NODE_DATA(nid);
- val->totalram = pgdat->node_present_pages;
- val->freeram = nr_free_pages_pgdat(pgdat);
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+ managed_pages += pgdat->node_zones[zone_type].managed_pages;
+ val->totalram = managed_pages;
+ val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
- val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
- val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+ val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
+ val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+ NR_FREE_PAGES);
#else
val->totalhigh = 0;
val->freehigh = 0;
@@ -1294,85 +3085,176 @@ void si_meminfo_node(struct sysinfo *val, int nid)
}
#endif
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+ bool ret = false;
+ unsigned int cpuset_mems_cookie;
+
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ goto out;
+
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+out:
+ return ret;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RESERVE] = 'R',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ [MIGRATE_ISOLATE] = 'I',
+#endif
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk("(%s) ", tmp);
+}
+
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
*/
-void show_free_areas(void)
+void show_free_areas(unsigned int filter)
{
int cpu;
- unsigned long active;
- unsigned long inactive;
- unsigned long free;
struct zone *zone;
- for_each_zone(zone) {
- if (!populated_zone(zone))
+ for_each_populated_zone(zone) {
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
-
show_node(zone);
printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
- pageset = zone_pcp(zone, cpu);
+ pageset = per_cpu_ptr(zone->pageset, cpu);
- printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
- "Cold: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp[0].high,
- pageset->pcp[0].batch, pageset->pcp[0].count,
- pageset->pcp[1].high, pageset->pcp[1].batch,
- pageset->pcp[1].count);
+ printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp.high,
+ pageset->pcp.batch, pageset->pcp.count);
}
}
- get_zone_counts(&active, &inactive, &free);
-
- printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
- active,
- inactive,
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+ " unevictable:%lu"
+ " dirty:%lu writeback:%lu unstable:%lu\n"
+ " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " free_cma:%lu\n",
+ global_page_state(NR_ACTIVE_ANON),
+ global_page_state(NR_INACTIVE_ANON),
+ global_page_state(NR_ISOLATED_ANON),
+ global_page_state(NR_ACTIVE_FILE),
+ global_page_state(NR_INACTIVE_FILE),
+ global_page_state(NR_ISOLATED_FILE),
+ global_page_state(NR_UNEVICTABLE),
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- nr_free_pages(),
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_page_state(NR_FREE_PAGES),
+ global_page_state(NR_SLAB_RECLAIMABLE),
+ global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
- global_page_state(NR_PAGETABLE));
+ global_page_state(NR_SHMEM),
+ global_page_state(NR_PAGETABLE),
+ global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_CMA_PAGES));
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
int i;
- if (!populated_zone(zone))
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
-
show_node(zone);
printk("%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
- " active:%lukB"
- " inactive:%lukB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
" present:%lukB"
+ " managed:%lukB"
+ " mlocked:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " mapped:%lukB"
+ " shmem:%lukB"
+ " slab_reclaimable:%lukB"
+ " slab_unreclaimable:%lukB"
+ " kernel_stack:%lukB"
+ " pagetables:%lukB"
+ " unstable:%lukB"
+ " bounce:%lukB"
+ " free_cma:%lukB"
+ " writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone->free_pages),
- K(zone->pages_min),
- K(zone->pages_low),
- K(zone->pages_high),
- K(zone->nr_active),
- K(zone->nr_inactive),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(min_wmark_pages(zone)),
+ K(low_wmark_pages(zone)),
+ K(high_wmark_pages(zone)),
+ K(zone_page_state(zone, NR_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ISOLATED_ANON)),
+ K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
+ K(zone->managed_pages),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_FILE_DIRTY)),
+ K(zone_page_state(zone, NR_WRITEBACK)),
+ K(zone_page_state(zone, NR_FILE_MAPPED)),
+ K(zone_page_state(zone, NR_SHMEM)),
+ K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
+ K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+ zone_page_state(zone, NR_KERNEL_STACK) *
+ THREAD_SIZE / 1024,
+ K(zone_page_state(zone, NR_PAGETABLE)),
+ K(zone_page_state(zone, NR_UNSTABLE_NFS)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+ K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
- (zone->all_unreclaimable ? "yes" : "no")
+ (!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1380,57 +3262,191 @@ void show_free_areas(void)
printk("\n");
}
- for_each_zone(zone) {
- unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ for_each_populated_zone(zone) {
+ unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned char types[MAX_ORDER];
- if (!populated_zone(zone))
+ if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
-
show_node(zone);
printk("%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr[order] = zone->free_area[order].nr_free;
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!list_empty(&area->free_list[type]))
+ types[order] |= 1 << type;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++)
+ for (order = 0; order < MAX_ORDER; order++) {
printk("%lu*%lukB ", nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
printk("= %lukB\n", K(total));
}
+ hugetlb_show_meminfo();
+
+ printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+
show_swap_cache_info();
}
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+ zoneref->zone = zone;
+ zoneref->zone_idx = zone_idx(zone);
+}
+
/*
* Builds allocation fallback zone lists.
*
* Add all populated zones of a node to the zonelist.
*/
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
- struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+ int nr_zones)
{
struct zone *zone;
-
- BUG_ON(zone_type >= MAX_NR_ZONES);
- zone_type++;
+ enum zone_type zone_type = MAX_NR_ZONES;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
- zonelist->zones[nr_zones++] = zone;
+ zoneref_set_zone(zone,
+ &zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
-
} while (zone_type);
+
return nr_zones;
}
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
#ifdef CONFIG_NUMA
-#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+/* The value the user specified; changed by boot parameter or sysctl */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configuring zonelist ordering.
+ * command line option "numa_zonelist_order"
+ * = "[dD]efault" - default, automatic configuration.
+ * = "[nN]ode" - order by node locality, then by zone within node
+ * = "[zZ]one" - order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+ if (*s == 'd' || *s == 'D') {
+ user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+ } else if (*s == 'n' || *s == 'N') {
+ user_zonelist_order = ZONELIST_ORDER_NODE;
+ } else if (*s == 'z' || *s == 'Z') {
+ user_zonelist_order = ZONELIST_ORDER_ZONE;
+ } else {
+ printk(KERN_WARNING
+ "Ignoring invalid numa_zonelist_order value: "
+ "%s\n", s);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ int ret;
+ static DEFINE_MUTEX(zl_order_mutex);
+
+ mutex_lock(&zl_order_mutex);
+ if (write) {
+ if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
+ ret = -EINVAL;
+ goto out;
+ }
+ strcpy(saved_string, (char *)table->data);
+ }
+ ret = proc_dostring(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
+ if (write) {
+ int oldval = user_zonelist_order;
+
+ ret = __parse_numa_zonelist_order((char *)table->data);
+ if (ret) {
+ /*
+ * bogus value. restore saved string
+ */
+ strncpy((char *)table->data, saved_string,
+ NUMA_ZONELIST_ORDER_LEN);
+ user_zonelist_order = oldval;
+ } else if (oldval != user_zonelist_order) {
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ }
+ }
+out:
+ mutex_unlock(&zl_order_mutex);
+ return ret;
+}
+
+
+#define MAX_NODE_LOAD (nr_online_nodes)
+static int node_load[MAX_NUMNODES];
+
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
@@ -1445,11 +3461,12 @@ static int __meminitdata node_load[MAX_NUMNODES];
* on them otherwise.
* It returns -1 if no node is found.
*/
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
- int best_node = -1;
+ int best_node = NUMA_NO_NODE;
+ const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -1457,8 +3474,7 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
return node;
}
- for_each_online_node(n) {
- cpumask_t tmp;
+ for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -1471,8 +3487,8 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = node_to_cpumask(n);
- if (!cpus_empty(tmp))
+ tmp = cpumask_of_node(n);
+ if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -1491,117 +3507,400 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
return best_node;
}
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
- int j, node, local_node;
- enum zone_type i;
- int prev_node, load;
+ int j;
+ struct zonelist *zonelist;
+
+ zonelist = &pgdat->node_zonelists[0];
+ for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
+ ;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+ int j;
struct zonelist *zonelist;
+
+ zonelist = &pgdat->node_zonelists[1];
+ j = build_zonelists_node(pgdat, zonelist, 0);
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+ int pos, j, node;
+ int zone_type; /* needs to be signed */
+ struct zone *z;
+ struct zonelist *zonelist;
+
+ zonelist = &pgdat->node_zonelists[0];
+ pos = 0;
+ for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+ for (j = 0; j < nr_nodes; j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ zoneref_set_zone(z,
+ &zonelist->_zonerefs[pos++]);
+ check_highest_zone(zone_type);
+ }
+ }
+ }
+ zonelist->_zonerefs[pos].zone = NULL;
+ zonelist->_zonerefs[pos].zone_idx = 0;
+}
+
+static int default_zonelist_order(void)
+{
+ int nid, zone_type;
+ unsigned long low_kmem_size, total_size;
+ struct zone *z;
+ int average_size;
+ /*
+ * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
+ * If they are really small and used heavily, the system can fall
+ * into OOM very easily.
+ * This function detects ZONE_DMA/DMA32 size and configures zone order.
+ */
+ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+ low_kmem_size = 0;
+ total_size = 0;
+ for_each_online_node(nid) {
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->managed_pages;
+ total_size += z->managed_pages;
+ } else if (zone_type == ZONE_NORMAL) {
+ /*
+ * If any node has only lowmem, then node order
+ * is preferred to allow kernel allocations
+ * locally; otherwise, they can easily infringe
+ * on other nodes when there is an abundance of
+ * lowmem available to allocate from.
+ */
+ return ZONELIST_ORDER_NODE;
+ }
+ }
+ }
+ if (!low_kmem_size || /* there are no DMA area. */
+ low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
+ return ZONELIST_ORDER_NODE;
+ /*
+ * look into each node's config.
+ * If there is a node whose DMA/DMA32 memory makes up a very large part
+ * of its local memory, NODE_ORDER may be suitable.
+ */
+ average_size = total_size /
+ (nodes_weight(node_states[N_MEMORY]) + 1);
+ for_each_online_node(nid) {
+ low_kmem_size = 0;
+ total_size = 0;
+ for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+ z = &NODE_DATA(nid)->node_zones[zone_type];
+ if (populated_zone(z)) {
+ if (zone_type < ZONE_NORMAL)
+ low_kmem_size += z->present_pages;
+ total_size += z->present_pages;
+ }
+ }
+ if (low_kmem_size &&
+ total_size > average_size && /* ignore small node */
+ low_kmem_size > total_size * 70/100)
+ return ZONELIST_ORDER_NODE;
+ }
+ return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+ if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+ current_zonelist_order = default_zonelist_order();
+ else
+ current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ int j, node, load;
+ enum zone_type i;
nodemask_t used_mask;
+ int local_node, prev_node;
+ struct zonelist *zonelist;
+ int order = current_zonelist_order;
/* initialize zonelists */
- for (i = 0; i < MAX_NR_ZONES; i++) {
+ for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist = pgdat->node_zonelists + i;
- zonelist->zones[0] = NULL;
+ zonelist->_zonerefs[0].zone = NULL;
+ zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
- load = num_online_nodes();
+ load = nr_online_nodes;
prev_node = local_node;
nodes_clear(used_mask);
- while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
- int distance = node_distance(local_node, node);
- /*
- * If another node is sufficiently far away then it is better
- * to reclaim pages in a zone before going off node.
- */
- if (distance > RECLAIM_DISTANCE)
- zone_reclaim_mode = 1;
+ memset(node_order, 0, sizeof(node_order));
+ j = 0;
+ while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
+ node_load[node] = load;
- if (distance != node_distance(local_node, prev_node))
- node_load[node] += load;
prev_node = node;
load--;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- zonelist = pgdat->node_zonelists + i;
- for (j = 0; zonelist->zones[j] != NULL; j++);
+ if (order == ZONELIST_ORDER_NODE)
+ build_zonelists_in_node_order(pgdat, node);
+ else
+ node_order[j++] = node; /* remember order */
+ }
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- zonelist->zones[j] = NULL;
- }
+ if (order == ZONELIST_ORDER_ZONE) {
+ /* calculate node order -- i.e., DMA last! */
+ build_zonelists_in_zone_order(pgdat, j);
}
+
+ build_thisnode_zonelists(pgdat);
+}
+
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zoneref *z;
+
+ zonelist = &pgdat->node_zonelists[0];
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->_zonerefs; z->zone; z++)
+ zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
+}
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+ struct zone *zone;
+
+ (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+ gfp_zone(GFP_KERNEL),
+ NULL,
+ &zone);
+ return zone->node;
}
+#endif
#else /* CONFIG_NUMA */
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+ current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type i,j;
+ enum zone_type j;
+ struct zonelist *zonelist;
local_node = pgdat->node_id;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zonelist *zonelist;
- zonelist = pgdat->node_zonelists + i;
+ zonelist = &pgdat->node_zonelists[0];
+ j = build_zonelists_node(pgdat, zonelist, 0);
- j = build_zonelists_node(pgdat, zonelist, 0, i);
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don't want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- */
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
- }
-
- zonelist->zones[j] = NULL;
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
}
+ for (node = 0; node < local_node; node++) {
+ if (!node_online(node))
+ continue;
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ }
+
+ zonelist->_zonerefs[j].zone = NULL;
+ zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+ pgdat->node_zonelists[0].zlcache_ptr = NULL;
}
#endif /* CONFIG_NUMA */
-/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
+/* return value is int just for stop_machine() */
+static int __build_all_zonelists(void *data)
{
int nid;
- for_each_online_node(nid)
- build_zonelists(NODE_DATA(nid));
+ int cpu;
+ pg_data_t *self = data;
+
+#ifdef CONFIG_NUMA
+ memset(node_load, 0, sizeof(node_load));
+#endif
+
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ build_zonelist_cache(self);
+ }
+
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ build_zonelists(pgdat);
+ build_zonelist_cache(pgdat);
+ }
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu) {
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+ /*
+ * We now know the "local memory node" for each node--
+ * i.e., the node of the first zone in the generic zonelist.
+ * Set up numa_mem percpu variable for on-line cpus. During
+ * boot, only the boot cpu should be on-line; we'll init the
+ * secondary cpus' numa_mem as they come on-line. During
+ * node/memory hotplug, we'll fixup all on-line cpus.
+ */
+ if (cpu_online(cpu))
+ set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+ }
+
return 0;
}
-void __meminit build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
+ set_zonelist_order();
+
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
+ mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
} else {
- /* we have to stop all cpus to guaranntee there is no user
+#ifdef CONFIG_MEMORY_HOTPLUG
+ if (zone)
+ setup_zone_pageset(zone);
+#endif
+ /* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+ stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
- printk("Built %i zonelists. Total pages: %ld\n",
- num_online_nodes(), vm_total_pages);
+ /*
+ * Disable grouping by mobility if the number of pages in the
+ * system is too low to allow the mechanism to work. It would be
+ * more accurate, but expensive to check per-zone. This check is
+ * made on memory-hotadd so a system can start with mobility
+ * disabled and enable it later
+ */
+ if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
+ page_group_by_mobility_disabled = 1;
+ else
+ page_group_by_mobility_disabled = 0;
+
+ printk("Built %i zonelists in %s order, mobility grouping %s. "
+ "Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
+#ifdef CONFIG_NUMA
+ printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
}
/*
@@ -1670,7 +3969,116 @@ static inline unsigned long wait_table_bits(unsigned long size)
return ffz(~size);
}
-#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+/*
+ * Check if a pageblock contains reserved pages
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Mark a number of pageblocks as MIGRATE_RESERVE. The number
+ * of blocks reserved is based on min_wmark_pages(zone). The memory within
+ * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
+ * higher will lead to a bigger reserve which will get freed as contiguous
+ * blocks as reclaim kicks in
+ */
+static void setup_zone_migrate_reserve(struct zone *zone)
+{
+ unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
+ struct page *page;
+ unsigned long block_migratetype;
+ int reserve;
+ int old_reserve;
+
+ /*
+ * Get the start pfn, end pfn and the number of blocks to reserve
+ * We have to be careful to be aligned to pageblock_nr_pages to
+ * make sure that we always check pfn_valid for the first page in
+ * the block.
+ */
+ start_pfn = zone->zone_start_pfn;
+ end_pfn = zone_end_pfn(zone);
+ start_pfn = roundup(start_pfn, pageblock_nr_pages);
+ reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
+ pageblock_order;
+
+ /*
+ * Reserve blocks are generally in place to help high-order atomic
+ * allocations that are short-lived. A min_free_kbytes value that
+ * would result in more than 2 reserve blocks for atomic allocations
+ * is assumed to be in place to help anti-fragmentation for the
+ * future allocation of hugepages at runtime.
+ */
+ reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* On memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+
+ /* Watch out for overlapping nodes */
+ if (page_to_nid(page) != zone_to_nid(zone))
+ continue;
+
+ block_migratetype = get_pageblock_migratetype(page);
+
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;
+
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
+ }
+
+ /*
+ * If the reserve is met and this is a previous reserved block,
+ * take it back
+ */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ move_freepages_block(zone, page, MIGRATE_MOVABLE);
+ }
+ }
+}
/*
* Initially all pages are reserved - free ones are freed
@@ -1678,20 +4086,55 @@ static inline unsigned long wait_table_bits(unsigned long size)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
+ unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
+ struct zone *z;
+
+ if (highest_memmap_pfn < end_pfn - 1)
+ highest_memmap_pfn = end_pfn - 1;
+ z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!early_pfn_valid(pfn))
- continue;
+ /*
+ * There can be holes in boot-time mem_map[]s
+ * handed to this function. They do not
+ * exist on hotplugged memory.
+ */
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ }
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
+ mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
- reset_page_mapcount(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
SetPageReserved(page);
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made. Later some blocks near
+ * the start are marked MIGRATE_RESERVE by
+ * setup_zone_migrate_reserve()
+ *
+ * The bitmap is created for the zone's valid pfn range, but memmap
+ * can be created for invalid pages (for alignment), so check here
+ * to avoid calling set_pageblock_migratetype() against a pfn
+ * outside the zone.
+ */
+ if ((z->zone_start_pfn <= pfn)
+ && (pfn < zone_end_pfn(z))
+ && !(pfn & (pageblock_nr_pages - 1)))
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
@@ -1701,37 +4144,23 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
}
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
- unsigned long size)
+static void __meminit zone_init_free_lists(struct zone *zone)
{
- int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
+ unsigned int order, t;
+ for_each_migratetype_order(order, t) {
+ INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn))
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __cpuinit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
+#ifdef CONFIG_MMU
int batch;
/*
@@ -1740,7 +4169,7 @@ static int __cpuinit zone_batchsize(struct zone *zone)
*
* OK, so we don't know how big the cache is. So guess.
*/
- batch = zone->present_pages / 1024;
+ batch = zone->managed_pages / 1024;
if (batch * PAGE_SIZE > 512 * 1024)
batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
@@ -1757,163 +4186,137 @@ static int __cpuinit zone_batchsize(struct zone *zone)
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
*/
- batch = (1 << (fls(batch + batch/2)-1)) - 1;
+ batch = rounddown_pow_of_two(batch + batch/2) - 1;
return batch;
+
+#else
+ /* The deferral and batching of frees should be suppressed under NOMMU
+ * conditions.
+ *
+ * The problem is that NOMMU needs to be able to allocate large chunks
+ * of contiguous memory as there's no hardware page translation to
+ * assemble apparent contiguous memory from discontiguous pages.
+ *
+ * Queueing large contiguous runs of pages for batching, however,
+ * causes the pages to actually be freed in smaller chunks. As there
+ * can be a significant delay between the individual batches being
+ * recycled, this leads to the once large chunks of space being
+ * fragmented and becoming unavailable for high-order allocations.
+ */
+ return 0;
+#endif
}
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher than ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (according to the above rule).
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+ unsigned long batch)
+{
+ /* start with a fail safe value for batch */
+ pcp->batch = 1;
+ smp_wmb();
+
+ /* Update high, then batch, in order */
+ pcp->high = high;
+ smp_wmb();
+
+ pcp->batch = batch;
+}
+
+/* a companion to pageset_set_high() */
+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+ pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+}
+
+static void pageset_init(struct per_cpu_pageset *p)
{
struct per_cpu_pages *pcp;
+ int migratetype;
memset(p, 0, sizeof(*p));
- pcp = &p->pcp[0]; /* hot */
+ pcp = &p->pcp;
pcp->count = 0;
- pcp->high = 6 * batch;
- pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
+ for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+ INIT_LIST_HEAD(&pcp->lists[migratetype]);
+}
- pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
- pcp->high = 2 * batch;
- pcp->batch = max(1UL, batch/2);
- INIT_LIST_HEAD(&pcp->list);
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ pageset_init(p);
+ pageset_set_batch(p, batch);
}
/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+static void pageset_set_high(struct per_cpu_pageset *p,
unsigned long high)
{
- struct per_cpu_pages *pcp;
+ unsigned long batch = max(1UL, high / 4);
+ if ((high / 4) > (PAGE_SHIFT * 8))
+ batch = PAGE_SHIFT * 8;
- pcp = &p->pcp[0]; /* hot list */
- pcp->high = high;
- pcp->batch = max(1UL, high/4);
- if ((high/4) > (PAGE_SHIFT * 8))
- pcp->batch = PAGE_SHIFT * 8;
+ pageset_update(&p->pcp, high, batch);
}
-
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
-/*
- * Dynamically allocate memory for the
- * per cpu pageset array in struct zone.
- */
-static int __cpuinit process_zones(int cpu)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
- struct zone *zone, *dzone;
-
- for_each_zone(zone) {
-
- if (!populated_zone(zone))
- continue;
-
- zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
- GFP_KERNEL, cpu_to_node(cpu));
- if (!zone_pcp(zone, cpu))
- goto bad;
-
- setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
- if (percpu_pagelist_fraction)
- setup_pagelist_highmark(zone_pcp(zone, cpu),
- (zone->present_pages / percpu_pagelist_fraction));
- }
-
- return 0;
-bad:
- for_each_zone(dzone) {
- if (dzone == zone)
- break;
- kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = NULL;
- }
- return -ENOMEM;
+ if (percpu_pagelist_fraction)
+ pageset_set_high(pcp,
+ (zone->managed_pages /
+ percpu_pagelist_fraction));
+ else
+ pageset_set_batch(pcp, zone_batchsize(zone));
}
-static inline void free_zone_pagesets(int cpu)
+static void __meminit zone_pageset_init(struct zone *zone, int cpu)
{
- struct zone *zone;
+ struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
- for_each_zone(zone) {
- struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
- /* Free per_cpu_pageset if it is slab allocated */
- if (pset != &boot_pageset[cpu])
- kfree(pset);
- zone_pcp(zone, cpu) = NULL;
- }
+ pageset_init(pcp);
+ pageset_set_high_and_batch(zone, pcp);
}
-static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static void __meminit setup_zone_pageset(struct zone *zone)
{
- int cpu = (long)hcpu;
- int ret = NOTIFY_OK;
-
- switch (action) {
- case CPU_UP_PREPARE:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
- }
- return ret;
+ int cpu;
+ zone->pageset = alloc_percpu(struct per_cpu_pageset);
+ for_each_possible_cpu(cpu)
+ zone_pageset_init(zone, cpu);
}
-static struct notifier_block __cpuinitdata pageset_notifier =
- { &pageset_cpuup_callback, NULL, 0 };
-
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ */
void __init setup_per_cpu_pageset(void)
{
- int err;
+ struct zone *zone;
- /* Initialize per_cpu_pageset for cpu 0.
- * A cpuup callback will do this for every cpu
- * as it comes online
- */
- err = process_zones(smp_processor_id());
- BUG_ON(err);
- register_cpu_notifier(&pageset_notifier);
+ for_each_populated_zone(zone)
+ setup_zone_pageset(zone);
}
-#endif
-
-static __meminit
+static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
@@ -1927,9 +4330,10 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
alloc_size = zone->wait_table_hash_nr_entries
* sizeof(wait_queue_head_t);
- if (system_state == SYSTEM_BOOTING) {
+ if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -1941,12 +4345,12 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
* To use this new node's memory, further consideration will be
* necessary.
*/
- zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+ zone->wait_table = vmalloc(alloc_size);
}
if (!zone->wait_table)
return -ENOMEM;
- for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+ for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
init_waitqueue_head(zone->wait_table + i);
return 0;
@@ -1954,26 +4358,23 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
static __meminit void zone_pcp_init(struct zone *zone)
{
- int cpu;
- unsigned long batch = zone_batchsize(zone);
+ /*
+ * per cpu subsystem is not up at this point. The following code
+ * relies on the ability of the linker to provide the
+ * offset of a (static) per cpu variable into the per cpu area.
+ */
+ zone->pageset = &boot_pageset;
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
- /* Early boot. Slab allocator not functional yet */
- zone_pcp(zone, cpu) = &boot_pageset[cpu];
- setup_pageset(&boot_pageset[cpu],0);
-#else
- setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
- }
- if (zone->present_pages)
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone->name, zone->present_pages, batch);
+ if (populated_zone(zone))
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+ zone->name, zone->present_pages,
+ zone_batchsize(zone));
}
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size)
+ unsigned long size,
+ enum memmap_context context)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
@@ -1984,222 +4385,215 @@ __meminit int init_currently_empty_zone(struct zone *zone,
zone->zone_start_pfn = zone_start_pfn;
- memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+ mminit_dprintk(MMINIT_TRACE, "memmap_init",
+ "Initialising map node %d zone %lu pfns %lu -> %lu\n",
+ pgdat->node_id,
+ (unsigned long)zone_idx(zone),
+ zone_start_pfn, (zone_start_pfn + size));
- zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+ zone_init_free_lists(zone);
return 0;
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
- * Basic iterator support. Return the first range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns first region regardless of node
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
-static int __init first_active_region_index_in_nid(int nid)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
{
- int i;
+ unsigned long start_pfn, end_pfn;
+ int nid;
+ /*
+ * NOTE: The following SMP-unsafe globals are only used early in boot
+ * when the kernel is running single-threaded.
+ */
+ static unsigned long __meminitdata last_start_pfn, last_end_pfn;
+ static int __meminitdata last_nid;
+
+ if (last_start_pfn <= pfn && pfn < last_end_pfn)
+ return last_nid;
- for (i = 0; i < nr_nodemap_entries; i++)
- if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
- return i;
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != -1) {
+ last_start_pfn = start_pfn;
+ last_end_pfn = end_pfn;
+ last_nid = nid;
+ }
- return -1;
+ return nid;
}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-/*
- * Basic iterator support. Return the next active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardles of node
- */
-static int __init next_active_region_index_in_nid(int index, int nid)
+int __meminit early_pfn_to_nid(unsigned long pfn)
{
- for (index = index + 1; index < nr_nodemap_entries; index++)
- if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
- return index;
+ int nid;
- return -1;
+ nid = __early_pfn_to_nid(pfn);
+ if (nid >= 0)
+ return nid;
+ /* no node found for this pfn; fall back to node 0 */
+ return 0;
}
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- * Architectures may implement their own version but if add_active_range()
- * was used and there are no special requirements, this is a convenient
- * alternative
- */
-int __init early_pfn_to_nid(unsigned long pfn)
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
- int i;
-
- for (i = 0; i < nr_nodemap_entries; i++) {
- unsigned long start_pfn = early_node_map[i].start_pfn;
- unsigned long end_pfn = early_node_map[i].end_pfn;
-
- if (start_pfn <= pfn && pfn < end_pfn)
- return early_node_map[i].nid;
- }
+ int nid;
- return 0;
+ nid = __early_pfn_to_nid(pfn);
+ if (nid >= 0 && nid != node)
+ return false;
+ return true;
}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-/* Basic iterator support to walk early_node_map[] */
-#define for_each_active_range_index_in_nid(i, nid) \
- for (i = first_active_region_index_in_nid(nid); i != -1; \
- i = next_active_region_index_in_nid(i, nid))
+#endif
/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
- * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * If an architecture guarantees that all ranges registered contain no holes
+ * and may be freed, this function may be used instead of calling
+ * memblock_free_early_nid() manually.
*/
-void __init free_bootmem_with_active_regions(int nid,
- unsigned long max_low_pfn)
+void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
- int i;
-
- for_each_active_range_index_in_nid(i, nid) {
- unsigned long size_pages = 0;
- unsigned long end_pfn = early_node_map[i].end_pfn;
-
- if (early_node_map[i].start_pfn >= max_low_pfn)
- continue;
+ unsigned long start_pfn, end_pfn;
+ int i, this_nid;
- if (end_pfn > max_low_pfn)
- end_pfn = max_low_pfn;
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+ start_pfn = min(start_pfn, max_low_pfn);
+ end_pfn = min(end_pfn, max_low_pfn);
- size_pages = end_pfn - early_node_map[i].start_pfn;
- free_bootmem_node(NODE_DATA(early_node_map[i].nid),
- PFN_PHYS(early_node_map[i].start_pfn),
- size_pages << PAGE_SHIFT);
+ if (start_pfn < end_pfn)
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
*
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling memory_present() manually.
+ * If an architecture guarantees that all ranges registered contain no holes and may
+ * be freed, this function may be used instead of calling memory_present() manually.
*/
void __init sparse_memory_present_with_active_regions(int nid)
{
- int i;
+ unsigned long start_pfn, end_pfn;
+ int i, this_nid;
- for_each_active_range_index_in_nid(i, nid)
- memory_present(early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+ memory_present(this_nid, start_pfn, end_pfn);
}
/**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
- nid, start_pfn, end_pfn);
-
- /* Initialise the boundary for this node if necessary */
- if (node_boundary_end_pfn[nid] == 0)
- node_boundary_start_pfn[nid] = -1UL;
-
- /* Update the boundaries */
- if (node_boundary_start_pfn[nid] > start_pfn)
- node_boundary_start_pfn[nid] = start_pfn;
- if (node_boundary_end_pfn[nid] < end_pfn)
- node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __init account_node_boundary(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn)
-{
- printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
- nid, *start_pfn, *end_pfn);
-
- /* Return if boundary information has not been provided */
- if (node_boundary_end_pfn[nid] == 0)
- return;
-
- /* Check the boundaries and update if necessary */
- if (node_boundary_start_pfn[nid] < *start_pfn)
- *start_pfn = node_boundary_start_pfn[nid];
- if (node_boundary_end_pfn[nid] > *end_pfn)
- *end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
- unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __init account_node_boundary(unsigned int nid,
- unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
-/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
- * provided by an arch calling add_active_range(). If called for a node
+ * provided by memblock_set_node(). If called for a node
* with no available memory, a warning is printed and the start and end
- * PFNs will be 0
+ * PFNs will be 0.
*/
-void __init get_pfn_range_for_nid(unsigned int nid,
+void __meminit get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
+ unsigned long this_start_pfn, this_end_pfn;
int i;
+
*start_pfn = -1UL;
*end_pfn = 0;
- for_each_active_range_index_in_nid(i, nid) {
- *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
- *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+ for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+ *start_pfn = min(*start_pfn, this_start_pfn);
+ *end_pfn = max(*end_pfn, this_end_pfn);
}
- if (*start_pfn == -1UL) {
- printk(KERN_WARNING "Node %u active with no memory\n", nid);
+ if (*start_pfn == -1UL)
*start_pfn = 0;
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonically
+ * increasing memory addresses so that the "highest" populated zone is used.
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+ int zone_index;
+ for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+ if (zone_index == ZONE_MOVABLE)
+ continue;
+
+ if (arch_zone_highest_possible_pfn[zone_index] >
+ arch_zone_lowest_possible_pfn[zone_index])
+ break;
}
- /* Push the node boundaries out if requested */
- account_node_boundary(nid, start_pfn, end_pfn);
+ VM_BUG_ON(zone_index == -1);
+ movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory addresses.
+ */
+static void __meminit adjust_zone_range_for_zone_movable(int nid,
+ unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn)
+{
+ /* Only adjust if ZONE_MOVABLE is on this node */
+ if (zone_movable_pfn[nid]) {
+ /* Size ZONE_MOVABLE */
+ if (zone_type == ZONE_MOVABLE) {
+ *zone_start_pfn = zone_movable_pfn[nid];
+ *zone_end_pfn = min(node_end_pfn,
+ arch_zone_highest_possible_pfn[movable_zone]);
+
+ /* Adjust for ZONE_MOVABLE starting within this range */
+ } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+ *zone_end_pfn > zone_movable_pfn[nid]) {
+ *zone_end_pfn = zone_movable_pfn[nid];
+
+ /* Check if this whole range is within ZONE_MOVABLE */
+ } else if (*zone_start_pfn >= zone_movable_pfn[nid])
+ *zone_start_pfn = *zone_end_pfn;
+ }
}
/*
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-unsigned long __init zone_spanned_pages_in_node(int nid,
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *ignored)
{
- unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
- /* Get the start and end of the node and zone */
- get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ /* Get the start and end of the zone */
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
/* Check that this node has pages within the zone's required range */
if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2215,52 +4609,22 @@ unsigned long __init zone_spanned_pages_in_node(int nid,
/*
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for
+ * then all holes in the requested range will be accounted for.
*/
-unsigned long __init __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
- int i = 0;
- unsigned long prev_end_pfn = 0, hole_pages = 0;
- unsigned long start_pfn;
-
- /* Find the end_pfn of the first active range of pfns in the node */
- i = first_active_region_index_in_nid(nid);
- if (i == -1)
- return 0;
-
- /* Account for ranges before physical memory on this node */
- if (early_node_map[i].start_pfn > range_start_pfn)
- hole_pages = early_node_map[i].start_pfn - range_start_pfn;
-
- prev_end_pfn = early_node_map[i].start_pfn;
-
- /* Find all holes for the zone within the node */
- for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
-
- /* No need to continue if prev_end_pfn is outside the zone */
- if (prev_end_pfn >= range_end_pfn)
- break;
-
- /* Make sure the end of the zone is not within the hole */
- start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
- prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+ unsigned long nr_absent = range_end_pfn - range_start_pfn;
+ unsigned long start_pfn, end_pfn;
+ int i;
- /* Update the hole size cound and move on */
- if (start_pfn > range_start_pfn) {
- BUG_ON(prev_end_pfn > start_pfn);
- hole_pages += start_pfn - prev_end_pfn;
- }
- prev_end_pfn = early_node_map[i].end_pfn;
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ nr_absent -= end_pfn - start_pfn;
}
-
- /* Account for ranges past physical memory on this node */
- if (range_end_pfn > prev_end_pfn)
- hole_pages = range_end_pfn -
- max(range_start_pfn, prev_end_pfn);
-
- return hole_pages;
+ return nr_absent;
}
/**
@@ -2268,7 +4632,7 @@ unsigned long __init __absent_pages_in_range(int nid,
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range
+ * It returns the number of page frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -2277,45 +4641,39 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
}
/* Return the number of page frames in holes in a zone on a node */
-unsigned long __init zone_absent_pages_in_node(int nid,
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *ignored)
{
- unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
- get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
- zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
- node_start_pfn);
- zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
- node_end_pfn);
+ zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
+ adjust_zone_range_for_zone_movable(nid, zone_type,
+ node_start_pfn, node_end_pfn,
+ &zone_start_pfn, &zone_end_pfn);
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
-/* Return the zone index a PFN is in */
-int memmap_zone_idx(struct page *lmem_map)
-{
- int i;
- unsigned long phys_addr = virt_to_phys(lmem_map);
- unsigned long pfn = phys_addr >> PAGE_SHIFT;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- if (pfn < arch_zone_highest_possible_pfn[i])
- break;
-
- return i;
-}
-#else
-static inline unsigned long zone_spanned_pages_in_node(int nid,
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *zones_size)
{
return zones_size[zone_type];
}
-static inline unsigned long zone_absent_pages_in_node(int nid,
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *zholes_size)
{
if (!zholes_size)
@@ -2324,40 +4682,140 @@ static inline unsigned long zone_absent_pages_in_node(int nid,
return zholes_size[zone_type];
}
-static inline int memmap_zone_idx(struct page *lmem_map)
-{
- return MAX_NR_ZONES;
-}
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static void __init calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
+static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zones_size,
+ unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
enum zone_type i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- zones_size);
+ node_start_pfn,
+ node_end_pfn,
+ zones_size);
pgdat->node_spanned_pages = totalpages;
realtotalpages = totalpages;
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -=
zone_absent_pages_in_node(pgdat->node_id, i,
- zholes_size);
+ node_start_pfn, node_end_pfn,
+ zholes_size);
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
}
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size of zone->pageblock_flags rounded to an unsigned long.
+ * Start by making sure zonesize is a multiple of pageblock_nr_pages by
+ * rounding up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock,
+ * finally round what is now in bits up to the nearest long in bits, then
+ * return it in bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
+{
+ unsigned long usemapsize;
+
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
+ usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = usemapsize >> pageblock_order;
+ usemapsize *= NR_PAGEBLOCK_BITS;
+ usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+ return usemapsize / 8;
+}
+
+static void __init setup_usemap(struct pglist_data *pgdat,
+ struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long zonesize)
+{
+ unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
+ zone->pageblock_flags = NULL;
+ if (usemapsize)
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
+}
+#else
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long zonesize) {}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+
+/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
+void __paginginit set_pageblock_order(void)
+{
+ unsigned int order;
+
+ /* Check that pageblock_order has not already been set up */
+ if (pageblock_order)
+ return;
+
+ if (HPAGE_SHIFT > PAGE_SHIFT)
+ order = HUGETLB_PAGE_ORDER;
+ else
+ order = MAX_ORDER - 1;
+
+ /*
+ * Assume the largest contiguous order of interest is a huge page.
+ * This value may be variable depending on boot parameters on IA64 and
+ * powerpc.
+ */
+ pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/*
+ * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
+ * is unused as pageblock_order is set at compile-time. See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
+ */
+void __paginginit set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
+{
+ unsigned long pages = spanned_pages;
+
+ /*
+ * Provide a more accurate estimation if there are holes within
+ * the zone and SPARSEMEM is in use. If there are holes within the
+ * zone, each populated memory region may cost us one or two extra
+ * memmap pages due to alignment because memmap pages for each
+ * populated region may not be naturally aligned on a page boundary.
+ * So the (present_pages >> 4) heuristic is a tradeoff for that.
+ */
+ if (spanned_pages > present_pages + (present_pages >> 4) &&
+ IS_ENABLED(CONFIG_SPARSEMEM))
+ pages = present_pages;
+
+ return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
*/
-static void __meminit free_area_init_core(struct pglist_data *pgdat,
+static void __paginginit free_area_init_core(struct pglist_data *pgdat,
+ unsigned long node_start_pfn, unsigned long node_end_pfn,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
@@ -2366,82 +4824,96 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
- pgdat->nr_zones = 0;
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
- pgdat->kswapd_max_order = 0;
-
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+ pgdat_page_cgroup_init(pgdat);
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, memmap_pages;
+ unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, zones_size);
- realsize = size - zone_absent_pages_in_node(nid, j,
+ size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
+ node_end_pfn, zones_size);
+ realsize = freesize = size - zone_absent_pages_in_node(nid, j,
+ node_start_pfn,
+ node_end_pfn,
zholes_size);
/*
- * Adjust realsize so that it accounts for how much memory
+ * Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
- if (realsize >= memmap_pages) {
- realsize -= memmap_pages;
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
+ memmap_pages = calc_memmap_size(size, realsize);
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
+ if (memmap_pages)
+ printk(KERN_DEBUG
+ " %s zone: %lu pages used for memmap\n",
+ zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
- " %s zone: %lu pages exceeds realsize %lu\n",
- zone_names[j], memmap_pages, realsize);
-
- /* Account for reserved DMA pages */
- if (j == ZONE_DMA && realsize > dma_reserve) {
- realsize -= dma_reserve;
- printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
- dma_reserve);
+ " %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
+
+ /* Account for reserved pages */
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
+ printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
+ zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
- nr_kernel_pages += realsize;
- nr_all_pages += realsize;
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
zone->spanned_pages = size;
zone->present_pages = realsize;
+ /*
+ * Set an approximate value for lowmem here, it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+ zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+ zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
- zone->free_pages = 0;
+ zone_pcp_init(zone);
- zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+ /* For bootup, initialized properly in watermark setup */
+ mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
- zone_pcp_init(zone);
- INIT_LIST_HEAD(&zone->active_list);
- INIT_LIST_HEAD(&zone->inactive_list);
- zone->nr_scan_active = 0;
- zone->nr_scan_inactive = 0;
- zone->nr_active = 0;
- zone->nr_inactive = 0;
- zap_zone_vm_stats(zone);
- atomic_set(&zone->reclaim_in_progress, 0);
+ lruvec_init(&zone->lruvec);
if (!size)
continue;
- zonetable_add(zone, nid, j, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ set_pageblock_order();
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn,
+ size, MEMMAP_EARLY);
BUG_ON(ret);
+ memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
-static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
@@ -2459,216 +4931,371 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
-#ifdef CONFIG_FLATMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= pgdat->node_start_pfn;
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+ mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
-void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long node_start_pfn,
- unsigned long *zholes_size)
+void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
+ unsigned long node_start_pfn, unsigned long *zholes_size)
{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = 0;
+ unsigned long end_pfn = 0;
+
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- calculate_node_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+#endif
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn,
+ zones_size, zholes_size);
alloc_node_mem_map(pgdat);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ nid, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
+#endif
- free_area_init_core(pgdat, zones_size, zholes_size);
+ free_area_init_core(pgdat, start_pfn, end_pfn,
+ zones_size, zholes_size);
}
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
- *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids.
*/
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
- unsigned long end_pfn)
+void __init setup_nr_node_ids(void)
{
- int i;
+ unsigned int node;
+ unsigned int highest = 0;
- printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
- "%d entries of %d used\n",
- nid, start_pfn, end_pfn,
- nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+ for_each_node_mask(node, node_possible_map)
+ highest = node;
+ nr_node_ids = highest + 1;
+}
+#endif
- /* Merge with existing active regions if possible */
- for (i = 0; i < nr_nodemap_entries; i++) {
- if (early_node_map[i].nid != nid)
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ unsigned long start, end, mask;
+ int last_nid = -1;
+ int i, nid;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
continue;
-
- /* Skip if an existing region covers this new one */
- if (start_pfn >= early_node_map[i].start_pfn &&
- end_pfn <= early_node_map[i].end_pfn)
- return;
-
- /* Merge forward if suitable */
- if (start_pfn <= early_node_map[i].end_pfn &&
- end_pfn > early_node_map[i].end_pfn) {
- early_node_map[i].end_pfn = end_pfn;
- return;
}
- /* Merge backward if suitable */
- if (start_pfn < early_node_map[i].end_pfn &&
- end_pfn >= early_node_map[i].start_pfn) {
- early_node_map[i].start_pfn = start_pfn;
- return;
- }
- }
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
- /* Check that early_node_map is large enough */
- if (i >= MAX_ACTIVE_REGIONS) {
- printk(KERN_CRIT "More than %d memory regions, truncating\n",
- MAX_ACTIVE_REGIONS);
- return;
+ /* accumulate all internode masks */
+ accl_mask |= mask;
}
- early_node_map[i].nid = nid;
- early_node_map[i].start_pfn = start_pfn;
- early_node_map[i].end_pfn = end_pfn;
- nr_nodemap_entries = i + 1;
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
}
-/**
- * shrink_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @old_end_pfn: The old end PFN of the range
- * @new_end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept at the end physical page range that has already been
- * registered with add_active_range(). This function allows an arch to shrink
- * an existing registered range.
- */
-void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
- unsigned long new_end_pfn)
+/* Find the lowest pfn for a node */
+static unsigned long __init find_min_pfn_for_node(int nid)
{
+ unsigned long min_pfn = ULONG_MAX;
+ unsigned long start_pfn;
int i;
- /* Find the old active region end and shrink */
- for_each_active_range_index_in_nid(i, nid)
- if (early_node_map[i].end_pfn == old_end_pfn) {
- early_node_map[i].end_pfn = new_end_pfn;
- break;
- }
+ for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
+ min_pfn = min(min_pfn, start_pfn);
+
+ if (min_pfn == ULONG_MAX) {
+ printk(KERN_WARNING
+ "Could not find start_pfn for node %d\n", nid);
+ return 0;
+ }
+
+ return min_pfn;
}
/**
- * remove_all_active_ranges - Remove all currently registered regions
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * memblock_set_node().
*/
-void __init remove_all_active_ranges()
+unsigned long __init find_min_pfn_with_active_regions(void)
{
- memset(early_node_map, 0, sizeof(early_node_map));
- nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
- memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
- memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+ return find_min_pfn_for_node(MAX_NUMNODES);
}
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_MEMORY for calculating usable_nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
{
- struct node_active_region *arange = (struct node_active_region *)a;
- struct node_active_region *brange = (struct node_active_region *)b;
+ unsigned long totalpages = 0;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
- /* Done this way to avoid overflows */
- if (arange->start_pfn > brange->start_pfn)
- return 1;
- if (arange->start_pfn < brange->start_pfn)
- return -1;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ unsigned long pages = end_pfn - start_pfn;
- return 0;
+ totalpages += pages;
+ if (pages)
+ node_set_state(nid, N_MEMORY);
+ }
+ return totalpages;
}
-/* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+static void __init find_zone_movable_pfns_for_nodes(void)
{
- sort(early_node_map, (size_t)nr_nodemap_entries,
- sizeof(struct node_active_region),
- cmp_node_active_region, NULL);
-}
+ int i, nid;
+ unsigned long usable_startpfn;
+ unsigned long kernelcore_node, kernelcore_remaining;
+ /* save the state before borrowing the nodemask */
+ nodemask_t saved_node_state = node_states[N_MEMORY];
+ unsigned long totalpages = early_calculate_totalpages();
+ int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_region *r;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
-/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
-unsigned long __init find_min_pfn_for_node(unsigned long nid)
-{
- int i;
+ /*
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for_each_memblock(memory, r) {
+ if (!memblock_is_hotpluggable(r))
+ continue;
- /* Assuming a sorted map, the first range found has the starting pfn */
- for_each_active_range_index_in_nid(i, nid)
- return early_node_map[i].start_pfn;
+ nid = r->nid;
- printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
- return 0;
-}
+ usable_startpfn = PFN_DOWN(r->base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
-/**
- * find_min_pfn_with_active_regions - Find the minimum PFN registered
- *
- * It returns the minimum PFN based on information provided via
- * add_active_range()
- */
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return find_min_pfn_for_node(MAX_NUMNODES);
+ goto out2;
+ }
+
+ /*
+ * If movablecore=nn[KMG] was specified, calculate what size of
+ * kernelcore that corresponds so that memory usable for
+ * any allocation type is evenly spread. If both kernelcore
+ * and movablecore are specified, then the value of kernelcore
+ * will be used for required_kernelcore if it's greater than
+ * what movablecore would have allowed.
+ */
+ if (required_movablecore) {
+ unsigned long corepages;
+
+ /*
+ * Round-up so that ZONE_MOVABLE is at least as large as what
+ * was requested by the user
+ */
+ required_movablecore =
+ roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ corepages = totalpages - required_movablecore;
+
+ required_kernelcore = max(required_kernelcore, corepages);
+ }
+
+ /* If kernelcore was not specified, there is no ZONE_MOVABLE */
+ if (!required_kernelcore)
+ goto out;
+
+ /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+ /* Spread kernelcore memory as evenly as possible throughout nodes */
+ kernelcore_node = required_kernelcore / usable_nodes;
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ /*
+ * Recalculate kernelcore_node if the division per node
+ * now exceeds what is necessary to satisfy the requested
+ * amount of memory for the kernel
+ */
+ if (required_kernelcore < kernelcore_node)
+ kernelcore_node = required_kernelcore / usable_nodes;
+
+ /*
+ * As the map is walked, we track how much memory is usable
+ * by the kernel using kernelcore_remaining. When it is
+ * 0, the rest of the node is usable by ZONE_MOVABLE
+ */
+ kernelcore_remaining = kernelcore_node;
+
+ /* Go through each range of PFNs within this node */
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ unsigned long size_pages;
+
+ start_pfn = max(start_pfn, zone_movable_pfn[nid]);
+ if (start_pfn >= end_pfn)
+ continue;
+
+ /* Account for what is only usable for kernelcore */
+ if (start_pfn < usable_startpfn) {
+ unsigned long kernel_pages;
+ kernel_pages = min(end_pfn, usable_startpfn)
+ - start_pfn;
+
+ kernelcore_remaining -= min(kernel_pages,
+ kernelcore_remaining);
+ required_kernelcore -= min(kernel_pages,
+ required_kernelcore);
+
+ /* Continue if range is now fully accounted */
+ if (end_pfn <= usable_startpfn) {
+
+ /*
+ * Push zone_movable_pfn to the end so
+ * that if we have to rebalance
+ * kernelcore across nodes, we will
+ * not double account here
+ */
+ zone_movable_pfn[nid] = end_pfn;
+ continue;
+ }
+ start_pfn = usable_startpfn;
+ }
+
+ /*
+ * The usable PFN range for ZONE_MOVABLE is from
+ * start_pfn->end_pfn. Calculate size_pages as the
+ * number of pages used as kernelcore
+ */
+ size_pages = end_pfn - start_pfn;
+ if (size_pages > kernelcore_remaining)
+ size_pages = kernelcore_remaining;
+ zone_movable_pfn[nid] = start_pfn + size_pages;
+
+ /*
+ * Some kernelcore has been met, update counts and
+ * break if the kernelcore for this node has been
+ * satisfied
+ */
+ required_kernelcore -= min(required_kernelcore,
+ size_pages);
+ kernelcore_remaining -= size_pages;
+ if (!kernelcore_remaining)
+ break;
+ }
+ }
+
+ /*
+ * If there is still required_kernelcore, we do another pass with one
+ * less node in the count. This will push zone_movable_pfn[nid] further
+ * along on the nodes that still have memory until kernelcore is
+ * satisfied
+ */
+ usable_nodes--;
+ if (usable_nodes && required_kernelcore > usable_nodes)
+ goto restart;
+
+out2:
+ /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ zone_movable_pfn[nid] =
+ roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+out:
+ /* restore the node_state */
+ node_states[N_MEMORY] = saved_node_state;
}
-/**
- * find_max_pfn_with_active_regions - Find the maximum PFN registered
- *
- * It returns the maximum PFN based on information provided via
- * add_active_range()
- */
-unsigned long __init find_max_pfn_with_active_regions(void)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
{
- int i;
- unsigned long max_pfn = 0;
+ enum zone_type zone_type;
- for (i = 0; i < nr_nodemap_entries; i++)
- max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ return;
- return max_pfn;
+ for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
+ struct zone *zone = &pgdat->node_zones[zone_type];
+ if (populated_zone(zone)) {
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+ zone_type <= ZONE_NORMAL)
+ node_set_state(nid, N_NORMAL_MEMORY);
+ break;
+ }
+ }
}
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
- * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
- * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
- * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
- * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
+ * @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by add_active_range(), the size of each
+ * Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -2678,8 +5305,8 @@ unsigned long __init find_max_pfn_with_active_regions(void)
*/
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
- unsigned long nid;
- enum zone_type i;
+ unsigned long start_pfn, end_pfn;
+ int i, nid;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -2689,83 +5316,250 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
+ arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+ arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
- /* Regions in the early_node_map can be in any order */
- sort_node_map();
+ /* Find the PFNs that ZONE_MOVABLE begins at in each node */
+ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+ find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
- printk("Zone PFN ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %-8s %8lu -> %8lu\n",
- zone_names[i],
- arch_zone_lowest_possible_pfn[i],
- arch_zone_highest_possible_pfn[i]);
-
- /* Print out the early_node_map[] */
- printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
- for (i = 0; i < nr_nodemap_entries; i++)
- printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
+ printk("Zone ranges:\n");
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (i == ZONE_MOVABLE)
+ continue;
+ printk(KERN_CONT " %-8s ", zone_names[i]);
+ if (arch_zone_lowest_possible_pfn[i] ==
+ arch_zone_highest_possible_pfn[i])
+ printk(KERN_CONT "empty\n");
+ else
+ printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
+ arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
+ (arch_zone_highest_possible_pfn[i]
+ << PAGE_SHIFT) - 1);
+ }
+
+ /* Print out the PFNs ZONE_MOVABLE begins at in each node */
+ printk("Movable zone start for each node\n");
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (zone_movable_pfn[i])
+ printk(" Node %d: %#010lx\n", i,
+ zone_movable_pfn[i] << PAGE_SHIFT);
+ }
+
+ /* Print out the early node map */
+ printk("Early memory node ranges\n");
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
+ start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
+ mminit_verify_pageflags_layout();
+ setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, pgdat, NULL,
+ free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
+
+ /* Any memory on that node */
+ if (pgdat->node_present_pages)
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat, nid);
}
}
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
+static int __init cmdline_parse_core(char *p, unsigned long *core)
+{
+ unsigned long long coremem;
+ if (!p)
+ return -EINVAL;
+
+ coremem = memparse(p, &p);
+ *core = coremem >> PAGE_SHIFT;
+
+ /* Paranoid check that UL is enough for the coremem value */
+ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+ return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+ return cmdline_parse_core(p, &required_kernelcore);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+ return cmdline_parse_core(p, &required_movablecore);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+void adjust_managed_page_count(struct page *page, long count)
+{
+ spin_lock(&managed_page_count_lock);
+ page_zone(page)->managed_pages += count;
+ totalram_pages += count;
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages += count;
+#endif
+ spin_unlock(&managed_page_count_lock);
+}
+EXPORT_SYMBOL(adjust_managed_page_count);
+
+unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+{
+ void *pos;
+ unsigned long pages = 0;
+
+ start = (void *)PAGE_ALIGN((unsigned long)start);
+ end = (void *)((unsigned long)end & PAGE_MASK);
+ for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+ if ((unsigned int)poison <= 0xFF)
+ memset(pos, poison, PAGE_SIZE);
+ free_reserved_page(virt_to_page(pos));
+ }
+
+ if (pages && s)
+ pr_info("Freeing %s memory: %ldK (%p - %p)\n",
+ s, pages << (PAGE_SHIFT - 10), start, end);
+
+ return pages;
+}
+EXPORT_SYMBOL(free_reserved_area);
+
+#ifdef CONFIG_HIGHMEM
+void free_highmem_page(struct page *page)
+{
+ __free_reserved_page(page);
+ totalram_pages++;
+ page_zone(page)->managed_pages++;
+ totalhigh_pages++;
+}
+#endif
+
+
+void __init mem_init_print_info(const char *str)
+{
+ unsigned long physpages, codesize, datasize, rosize, bss_size;
+ unsigned long init_code_size, init_data_size;
+
+ physpages = get_num_physpages();
+ codesize = _etext - _stext;
+ datasize = _edata - _sdata;
+ rosize = __end_rodata - __start_rodata;
+ bss_size = __bss_stop - __bss_start;
+ init_data_size = __init_end - __init_begin;
+ init_code_size = _einittext - _sinittext;
+
+ /*
+ * Detect special cases and adjust section sizes accordingly:
+ * 1) .init.* may be embedded into .data sections
+ * 2) .init.text.* may be out of [__init_begin, __init_end],
+ * please refer to arch/tile/kernel/vmlinux.lds.S.
+ * 3) .rodata.* may be embedded into .text or .data sections.
+ */
+#define adj_init_size(start, end, size, pos, adj) \
+ do { \
+ if (start <= pos && pos < end && size > adj) \
+ size -= adj; \
+ } while (0)
+
+ adj_init_size(__init_begin, __init_end, init_data_size,
+ _sinittext, init_code_size);
+ adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+ adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+ adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+ adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+ printk("Memory: %luK/%luK available "
+ "(%luK kernel code, %luK rwdata, %luK rodata, "
+ "%luK init, %luK bss, %luK reserved"
+#ifdef CONFIG_HIGHMEM
+ ", %luK highmem"
+#endif
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages) << (PAGE_SHIFT-10),
+#ifdef CONFIG_HIGHMEM
+ totalhigh_pages << (PAGE_SHIFT-10),
+#endif
+ str ? ", " : "", str ? str : "");
+}
/**
- * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
- * @new_dma_reserve - The number of pages to mark reserved
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
*
* The per-cpu batchsize and zone watermarks are determined by present_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in
- * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
*/
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
dma_reserve = new_dma_reserve;
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-static bootmem_data_t contig_bootmem_data;
-struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
-
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_node(0, NODE_DATA(0), zones_size,
+ free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (unsigned long)hcpu;
- if (action == CPU_DEAD) {
- local_irq_disable();
- __drain_pages(cpu);
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ lru_add_drain_cpu(cpu);
+ drain_pages(cpu);
+
+ /*
+ * Spill the event counters of the dead processor
+ * into the current processors event counters.
+ * This artificially elevates the count of the current
+ * processor.
+ */
vm_events_fold_cpu(cpu);
- local_irq_enable();
- refresh_cpu_vm_stats(cpu);
+
+ /*
+ * Zero the differential counters of the dead processor
+ * so that the vm statistics are consistent.
+ *
+ * This is only okay since the processor is dead and cannot
+ * race with what we are doing.
+ */
+ cpu_vm_stats_fold(cpu);
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
void __init page_alloc_init(void)
{
@@ -2793,14 +5587,25 @@ static void calculate_totalreserve_pages(void)
max = zone->lowmem_reserve[j];
}
- /* we treat pages_high as reserved pages. */
- max += zone->pages_high;
+ /* we treat the high watermark as reserved pages. */
+ max += high_wmark_pages(zone);
- if (max > zone->present_pages)
- max = zone->present_pages;
+ if (max > zone->managed_pages)
+ max = zone->managed_pages;
reserve_pages += max;
+ /*
+ * Lowmem reserves are not available to
+ * GFP_HIGHUSER page cache allocations and
+ * kswapd tries to balance zones to their high
+ * watermark. As a result, neither should be
+ * regarded as dirtyable memory, to prevent a
+ * situation where reclaim has to clean pages
+ * in order to balance the zones.
+ */
+ zone->dirty_balance_reserve = max;
}
}
+ dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
@@ -2818,7 +5623,7 @@ static void setup_per_zone_lowmem_reserve(void)
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long present_pages = zone->present_pages;
+ unsigned long managed_pages = zone->managed_pages;
zone->lowmem_reserve[j] = 0;
@@ -2832,9 +5637,9 @@ static void setup_per_zone_lowmem_reserve(void)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
- lower_zone->lowmem_reserve[j] = present_pages /
+ lower_zone->lowmem_reserve[j] = managed_pages /
sysctl_lowmem_reserve_ratio[idx];
- present_pages += lower_zone->present_pages;
+ managed_pages += lower_zone->managed_pages;
}
}
}
@@ -2843,12 +5648,7 @@ static void setup_per_zone_lowmem_reserve(void)
calculate_totalreserve_pages();
}
-/*
- * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
- * that the pages_{min,low,high} values for each zone are set correctly
- * with respect to min_free_kbytes.
- */
-void setup_per_zone_pages_min(void)
+static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
@@ -2858,14 +5658,14 @@ void setup_per_zone_pages_min(void)
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
- lowmem_pages += zone->present_pages;
+ lowmem_pages += zone->managed_pages;
}
for_each_zone(zone) {
u64 tmp;
- spin_lock_irqsave(&zone->lru_lock, flags);
- tmp = (u64)pages_min * zone->present_pages;
+ spin_lock_irqsave(&zone->lock, flags);
+ tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
@@ -2873,35 +5673,96 @@ void setup_per_zone_pages_min(void)
* need highmem pages, so cap pages_min to a small
* value here.
*
- * The (pages_high-pages_low) and (pages_low-pages_min)
+ * The (WMARK_HIGH-WMARK_LOW) and (WMARK_LOW-WMARK_MIN)
* deltas controls asynch page reclaim, and so should
* not be capped for highmem.
*/
- int min_pages;
-
- min_pages = zone->present_pages / 1024;
- if (min_pages < SWAP_CLUSTER_MAX)
- min_pages = SWAP_CLUSTER_MAX;
- if (min_pages > 128)
- min_pages = 128;
- zone->pages_min = min_pages;
+ unsigned long min_pages;
+
+ min_pages = zone->managed_pages / 1024;
+ min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
+ zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = tmp;
+ zone->watermark[WMARK_MIN] = tmp;
}
- zone->pages_low = zone->pages_min + (tmp >> 2);
- zone->pages_high = zone->pages_min + (tmp >> 1);
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ high_wmark_pages(zone) -
+ low_wmark_pages(zone) -
+ zone_page_state(zone, NR_ALLOC_BATCH));
+
+ setup_zone_migrate_reserve(zone);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+ mutex_lock(&zonelists_mutex);
+ __setup_per_zone_wmarks();
+ mutex_unlock(&zonelists_mutex);
+}
+
+/*
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total target max
+ * memory ratio inactive anon
+ * -------------------------------------
+ * 10MB 1 5MB
+ * 100MB 1 50MB
+ * 1GB 3 250MB
+ * 10GB 10 0.9GB
+ * 100GB 31 3GB
+ * 1TB 101 10GB
+ * 10TB 320 32GB
+ */
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
+{
+ unsigned int gb, ratio;
+
+ /* Zone size in gigabytes */
+ gb = zone->managed_pages >> (30 - PAGE_SHIFT);
+ if (gb)
+ ratio = int_sqrt(10 * gb);
+ else
+ ratio = 1;
+
+ zone->inactive_ratio = ratio;
+}
+
+static void __meminit setup_per_zone_inactive_ratio(void)
+{
+ struct zone *zone;
+
+ for_each_zone(zone)
+ calculate_zone_inactive_ratio(zone);
+}
+
/*
* Initialise min_free_kbytes.
*
@@ -2909,7 +5770,7 @@ void setup_per_zone_pages_min(void)
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
- * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
@@ -2926,65 +5787,82 @@ void setup_per_zone_pages_min(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_pages_min(void)
+int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
+ int new_min_free_kbytes;
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
-
- min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
- if (min_free_kbytes < 128)
- min_free_kbytes = 128;
- if (min_free_kbytes > 65536)
- min_free_kbytes = 65536;
- setup_per_zone_pages_min();
+ new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+ if (new_min_free_kbytes > user_min_free_kbytes) {
+ min_free_kbytes = new_min_free_kbytes;
+ if (min_free_kbytes < 128)
+ min_free_kbytes = 128;
+ if (min_free_kbytes > 65536)
+ min_free_kbytes = 65536;
+ } else {
+ pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+ new_min_free_kbytes, user_min_free_kbytes);
+ }
+ setup_per_zone_wmarks();
+ refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
+ setup_per_zone_inactive_ratio();
return 0;
}
-module_init(init_per_zone_pages_min)
+module_init(init_per_zone_wmark_min)
/*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
- setup_per_zone_pages_min();
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write) {
+ user_min_free_kbytes = min_free_kbytes;
+ setup_per_zone_wmarks();
+ }
return 0;
}
#ifdef CONFIG_NUMA
-int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->present_pages *
+ zone->min_unmapped_pages = (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
-int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
for_each_zone(zone)
- zone->min_slab_pages = (zone->present_pages *
+ zone->min_slab_pages = (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
@@ -2996,41 +5874,58 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
* whenever sysctl_lowmem_reserve_ratio changes.
*
* The reserve ratio obviously has absolutely no relation with the
- * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
setup_per_zone_lowmem_reserve();
return 0;
}
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
*/
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
- ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
- if (!write || (ret == -EINVAL))
- return ret;
- for_each_zone(zone) {
- for_each_online_cpu(cpu) {
- unsigned long high;
- high = zone->present_pages / percpu_pagelist_fraction;
- setup_pagelist_highmark(zone_pcp(zone, cpu), high);
- }
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
}
- return 0;
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
+
+ for_each_populated_zone(zone) {
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
+ }
+out:
+ mutex_unlock(&pcp_batch_high_lock);
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;
@@ -3059,25 +5954,38 @@ void *__init alloc_large_system_hash(const char *tablename,
int flags,
unsigned int *_hash_shift,
unsigned int *_hash_mask,
- unsigned long limit)
+ unsigned long low_limit,
+ unsigned long high_limit)
{
- unsigned long long max = limit;
+ unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
- numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
- numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
- numentries >>= 20 - PAGE_SHIFT;
- numentries <<= 20 - PAGE_SHIFT;
+ numentries = nr_kernel_pages;
+
+ /* It isn't necessary when PAGE_SIZE >= 1MB */
+ if (PAGE_SHIFT < 20)
+ numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
@@ -3086,33 +5994,41 @@ void *__init alloc_large_system_hash(const char *tablename,
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
do_div(max, bucketsize);
}
+ max = min(max, 0x80000000ULL);
+ if (numentries < low_limit)
+ numentries = low_limit;
if (numentries > max)
numentries = max;
- log2qty = long_log2(numentries);
+ log2qty = ilog2(numentries);
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem(size);
+ table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
- unsigned long order;
- for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
- ;
- table = (void*) __get_free_pages(GFP_ATOMIC, order);
+ /*
+ * If bucketsize is not a power-of-two, we may free
+ * some pages at the end of hash table which
+ * alloc_pages_exact() automatically does
+ */
+ if (get_order(size) < MAX_ORDER) {
+ table = alloc_pages_exact(size, GFP_ATOMIC);
+ kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ }
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename,
- (1U << log2qty),
- long_log2(size) - PAGE_SHIFT,
+ (1UL << log2qty),
+ ilog2(size) - PAGE_SHIFT,
size);
if (_hash_shift)
@@ -3123,15 +6039,606 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-struct page *pfn_to_page(unsigned long pfn)
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
+ unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ return __pfn_to_section(pfn)->pageblock_flags;
+#else
+ return zone->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+ pfn &= (PAGES_PER_SECTION-1);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+ pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ struct zone *zone;
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
+
+ zone = page_zone(page);
+ bitmap = get_pageblock_bitmap(zone, pfn);
+ bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
+{
+ struct zone *zone;
+ unsigned long *bitmap;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+ zone = page_zone(page);
+ bitmap = get_pageblock_bitmap(zone, pfn);
+ bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
+
+ VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
+
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = ACCESS_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
+}
+
+/*
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include up to @count unmovable pages
+ *
+ * PageLRU check without isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
+ */
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ bool skip_hwpoisoned_pages)
+{
+ unsigned long pfn, iter, found;
+ int mt;
+
+ /*
+ * To avoid noisy data, lru_add_drain_all() should be called.
+ * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return false;
+ mt = get_pageblock_migratetype(page);
+ if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
+ return false;
+
+ pfn = page_to_pfn(page);
+ for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ unsigned long check = pfn + iter;
+
+ if (!pfn_valid_within(check))
+ continue;
+
+ page = pfn_to_page(check);
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page)) {
+ iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * We can't use page_count() without pinning the page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_count is zero at all time.
+ */
+ if (!atomic_read(&page->_count)) {
+ if (PageBuddy(page))
+ iter += (1 << page_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if (skip_hwpoisoned_pages && PageHWPoison(page))
+ continue;
+
+ if (!PageLRU(page))
+ found++;
+ /*
+ * If there are RECLAIMABLE pages, we need to check it.
+ * But for now, memory offline itself doesn't call shrink_slab()
+ * and that is still to be fixed.
+ */
+ /*
+ * If the page is not RAM, page_count() should be 0.
+ * We don't need any further checks. This is a _used_ non-movable page.
+ *
+ * The problematic thing here is PG_reserved pages. PG_reserved
+ * is set to both of a memory hole page and a _used_ kernel
+ * page at boot.
+ */
+ if (found > count)
+ return true;
+ }
+ return false;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware so we might end up outside of
+ * the zone but still within the section.
+ * We have to take care about the node as well. If the node is offline
+ * its NODE_DATA will be NULL - see page_zone.
+ */
+ if (!node_online(page_to_nid(page)))
+ return false;
+
+ zone = page_zone(page);
+ pfn = page_to_pfn(page);
+ if (!zone_spans_pfn(zone, pfn))
+ return false;
+
+ return !has_unmovable_pages(zone, page, 0, true);
+}
+
+#ifdef CONFIG_CMA
+
+static unsigned long pfn_max_align_down(unsigned long pfn)
+{
+ return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages) - 1);
+}
+
+static unsigned long pfn_max_align_up(unsigned long pfn)
+{
+ return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages));
+}
+
+/* [start, end) must belong to a single zone. */
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
+{
+ /* This function is based on compact_zone() from compaction.c. */
+ unsigned long nr_reclaimed;
+ unsigned long pfn = start;
+ unsigned int tries = 0;
+ int ret = 0;
+
+ migrate_prep();
+
+ while (pfn < end || !list_empty(&cc->migratepages)) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (list_empty(&cc->migratepages)) {
+ cc->nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc->zone, cc,
+ pfn, end, true);
+ if (!pfn) {
+ ret = -EINTR;
+ break;
+ }
+ tries = 0;
+ } else if (++tries == 5) {
+ ret = ret < 0 ? ret : -EBUSY;
+ break;
+ }
+
+ nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+ &cc->migratepages);
+ cc->nr_migratepages -= nr_reclaimed;
+
+ ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
+ NULL, 0, cc->mode, MR_CMA);
+ }
+ if (ret < 0) {
+ putback_movable_pages(&cc->migratepages);
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start: start PFN to allocate
+ * @end: one-past-the-last PFN to allocate
+ * @migratetype: migratetype of the underlying pageblocks (either
+ * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
+ * in range must have the same migratetype and it must
+ * be either of the two.
+ *
+ * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
+ * aligned, however it's the caller's responsibility to guarantee that
+ * we are the only thread that changes migrate type of pageblocks the
+ * pages fall in.
+ *
+ * The PFN range must belong to a single zone.
+ *
+ * Returns zero on success or negative error code. On success all
+ * pages whose PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+ unsigned migratetype)
+{
+ unsigned long outer_start, outer_end;
+ int ret = 0, order;
+
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * What we do here is we mark all pageblocks in range as
+ * MIGRATE_ISOLATE. Because pageblock and max order pages may
+ * have different sizes, and due to the way the page allocator
+ * works, we align the range to the bigger of the two so
+ * that page allocator won't try to merge buddies from
+ * different pageblocks and change MIGRATE_ISOLATE to some
+ * other migration type.
+ *
+ * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+ * migrate the pages from an unaligned range (ie. pages that
+ * we are interested in). This will put all the pages in
+ * range back to page allocator as MIGRATE_ISOLATE.
+ *
+ * When this is done, we take the pages in range from page
+ * allocator removing them from the buddy system. This way
+ * page allocator will never consider using them.
+ *
+ * This lets us mark the pageblocks back as
+ * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+ * aligned range but not in the unaligned, original range are
+ * put back to page allocator so that buddy can use them.
+ */
+
+ ret = start_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype,
+ false);
+ if (ret)
+ return ret;
+
+ ret = __alloc_contig_migrate_range(&cc, start, end);
+ if (ret)
+ goto done;
+
+ /*
+ * Pages from [start, end) are within MAX_ORDER_NR_PAGES
+ * aligned blocks that are marked as MIGRATE_ISOLATE. What's
+ * more, all pages in [start, end) are free in page allocator.
+ * What we are going to do is to allocate all pages from
+ * [start, end) (that is remove them from page allocator).
+ *
+ * The only problem is that pages at the beginning and at the
+ * end of interesting range may be not aligned with pages that
+ * page allocator holds, ie. they can be part of higher order
+ * pages. Because of this, we reserve the bigger range and
+ * once this is done free the pages we are not interested in.
+ *
+ * We don't have to hold zone->lock here because the pages are
+ * isolated thus they won't get removed from buddy.
+ */
+
+ lru_add_drain_all();
+ drain_all_pages();
+
+ order = 0;
+ outer_start = start;
+ while (!PageBuddy(pfn_to_page(outer_start))) {
+ if (++order >= MAX_ORDER) {
+ ret = -EBUSY;
+ goto done;
+ }
+ outer_start &= ~0UL << order;
+ }
+
+ /* Make sure the range is really isolated. */
+ if (test_pages_isolated(outer_start, end, false)) {
+ pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
+ outer_start, end);
+ ret = -EBUSY;
+ goto done;
+ }
+
+
+ /* Grab isolated pages from freelists. */
+ outer_end = isolate_freepages_range(&cc, outer_start, end);
+ if (!outer_end) {
+ ret = -EBUSY;
+ goto done;
+ }
+
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+
+done:
+ undo_isolate_page_range(pfn_max_align_down(start),
+ pfn_max_align_up(end), migratetype);
+ return ret;
+}
+
+void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
- return __pfn_to_page(pfn);
+ unsigned int count = 0;
+
+ for (; nr_pages--; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ count += page_count(page) != 1;
+ __free_page(page);
+ }
+ WARN(count != 0, "%d pages are still in use!\n", count);
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
+void __meminit zone_pcp_update(struct zone *zone)
+{
+ unsigned cpu;
+ mutex_lock(&pcp_batch_high_lock);
+ for_each_possible_cpu(cpu)
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
+ mutex_unlock(&pcp_batch_high_lock);
}
-unsigned long page_to_pfn(struct page *page)
+#endif
+
+void zone_pcp_reset(struct zone *zone)
+{
+ unsigned long flags;
+ int cpu;
+ struct per_cpu_pageset *pset;
+
+ /* avoid races with drain_pages() */
+ local_irq_save(flags);
+ if (zone->pageset != &boot_pageset) {
+ for_each_online_cpu(cpu) {
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ drain_zonestat(zone, pset);
+ }
+ free_percpu(zone->pageset);
+ zone->pageset = &boot_pageset;
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * All pages in the range must be isolated before calling this.
+ */
+void
+__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct page *page;
+ struct zone *zone;
+ unsigned int order, i;
+ unsigned long pfn;
+ unsigned long flags;
+ /* find the first valid pfn */
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ if (pfn_valid(pfn))
+ break;
+ if (pfn == end_pfn)
+ return;
+ zone = page_zone(pfn_to_page(pfn));
+ spin_lock_irqsave(&zone->lock, flags);
+ pfn = start_pfn;
+ while (pfn < end_pfn) {
+ if (!pfn_valid(pfn)) {
+ pfn++;
+ continue;
+ }
+ page = pfn_to_page(pfn);
+ /*
+ * The HWPoisoned page may not be in the buddy system, and
+ * page_count() is not 0.
+ */
+ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+ pfn++;
+ SetPageReserved(page);
+ continue;
+ }
+
+ BUG_ON(page_count(page));
+ BUG_ON(!PageBuddy(page));
+ order = page_order(page);
+#ifdef CONFIG_DEBUG_VM
+ printk(KERN_INFO "remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
+#endif
+ list_del(&page->lru);
+ rmv_page_order(page);
+ zone->free_area[order].nr_free--;
+ for (i = 0; i < (1 << order); i++)
+ SetPageReserved((page+i));
+ pfn += (1 << order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif
+
+#ifdef CONFIG_MEMORY_FAILURE
+bool is_free_buddy_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+ if (PageBuddy(page_head) && page_order(page_head) >= order)
+ break;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return order < MAX_ORDER;
+}
+#endif
+
+static const struct trace_print_flags pageflag_names[] = {
+ {1UL << PG_locked, "locked" },
+ {1UL << PG_error, "error" },
+ {1UL << PG_referenced, "referenced" },
+ {1UL << PG_uptodate, "uptodate" },
+ {1UL << PG_dirty, "dirty" },
+ {1UL << PG_lru, "lru" },
+ {1UL << PG_active, "active" },
+ {1UL << PG_slab, "slab" },
+ {1UL << PG_owner_priv_1, "owner_priv_1" },
+ {1UL << PG_arch_1, "arch_1" },
+ {1UL << PG_reserved, "reserved" },
+ {1UL << PG_private, "private" },
+ {1UL << PG_private_2, "private_2" },
+ {1UL << PG_writeback, "writeback" },
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+ {1UL << PG_head, "head" },
+ {1UL << PG_tail, "tail" },
+#else
+ {1UL << PG_compound, "compound" },
+#endif
+ {1UL << PG_swapcache, "swapcache" },
+ {1UL << PG_mappedtodisk, "mappedtodisk" },
+ {1UL << PG_reclaim, "reclaim" },
+ {1UL << PG_swapbacked, "swapbacked" },
+ {1UL << PG_unevictable, "unevictable" },
+#ifdef CONFIG_MMU
+ {1UL << PG_mlocked, "mlocked" },
+#endif
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ {1UL << PG_uncached, "uncached" },
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ {1UL << PG_hwpoison, "hwpoison" },
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ {1UL << PG_compound_lock, "compound_lock" },
+#endif
+};
+
+static void dump_page_flags(unsigned long flags)
+{
+ const char *delim = "";
+ unsigned long mask;
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+
+ printk(KERN_ALERT "page flags: %#lx(", flags);
+
+ /* remove zone id */
+ flags &= (1UL << NR_PAGEFLAGS) - 1;
+
+ for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
+
+ mask = pageflag_names[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ flags &= ~mask;
+ printk("%s%s", delim, pageflag_names[i].name);
+ delim = "|";
+ }
+
+ /* check for left over flags */
+ if (flags)
+ printk("%s%#lx", delim, flags);
+
+ printk(")\n");
+}
+
+void dump_page_badflags(struct page *page, const char *reason,
+ unsigned long badflags)
+{
+ printk(KERN_ALERT
+ "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ page, atomic_read(&page->_count), page_mapcount(page),
+ page->mapping, page->index);
+ dump_page_flags(page->flags);
+ if (reason)
+ pr_alert("page dumped because: %s\n", reason);
+ if (page->flags & badflags) {
+ pr_alert("bad because of flags:\n");
+ dump_page_flags(page->flags & badflags);
+ }
+ mem_cgroup_print_bad_page(page);
+}
+
+void dump_page(struct page *page, const char *reason)
{
- return __page_to_pfn(page);
+ dump_page_badflags(page, reason, 0);
}
-EXPORT_SYMBOL(pfn_to_page);
-EXPORT_SYMBOL(page_to_pfn);
-#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+EXPORT_SYMBOL(dump_page);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 00000000000..3708264d283
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,529 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/bit_spinlock.h>
+#include <linux/page_cgroup.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/cgroup.h>
+#include <linux/swapops.h>
+#include <linux/kmemleak.h>
+
+static unsigned long total_usage;
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+ pgdat->node_page_cgroup = NULL;
+}
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long offset;
+ struct page_cgroup *base;
+
+ base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_cgroup arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (unlikely(!base))
+ return NULL;
+#endif
+ offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
+ return base + offset;
+}
+
+static int __init alloc_node_page_cgroup(int nid)
+{
+ struct page_cgroup *base;
+ unsigned long table_size;
+ unsigned long nr_pages;
+
+ nr_pages = NODE_DATA(nid)->node_spanned_pages;
+ if (!nr_pages)
+ return 0;
+
+ table_size = sizeof(struct page_cgroup) * nr_pages;
+
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ if (!base)
+ return -ENOMEM;
+ NODE_DATA(nid)->node_page_cgroup = base;
+ total_usage += table_size;
+ return 0;
+}
+
+void __init page_cgroup_init_flatmem(void)
+{
+
+ int nid, fail;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ for_each_online_node(nid) {
+ fail = alloc_node_page_cgroup(nid);
+ if (fail)
+ goto fail;
+ }
+ printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+ printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
+ " don't want memory cgroups\n");
+ return;
+fail:
+ printk(KERN_CRIT "allocation of page_cgroup failed.\n");
+ printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
+ panic("Out of memory");
+}
+
+#else /* CONFIG_SPARSEMEM */
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct mem_section *section = __pfn_to_section(pfn);
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * The sanity checks the page allocator does upon freeing a
+ * page can reach here before the page_cgroup arrays are
+ * allocated when feeding a range of pages to the allocator
+ * for the first time during bootup or memory hotplug.
+ */
+ if (!section->page_cgroup)
+ return NULL;
+#endif
+ return section->page_cgroup + pfn;
+}
+
+static void *__meminit alloc_page_cgroup(size_t size, int nid)
+{
+ gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+ void *addr = NULL;
+
+ addr = alloc_pages_exact_nid(nid, size, flags);
+ if (addr) {
+ kmemleak_alloc(addr, size, 1, flags);
+ return addr;
+ }
+
+ if (node_state(nid, N_HIGH_MEMORY))
+ addr = vzalloc_node(size, nid);
+ else
+ addr = vzalloc(size);
+
+ return addr;
+}
+
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
+{
+ struct mem_section *section;
+ struct page_cgroup *base;
+ unsigned long table_size;
+
+ section = __pfn_to_section(pfn);
+
+ if (section->page_cgroup)
+ return 0;
+
+ table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+ base = alloc_page_cgroup(table_size, nid);
+
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
+
+ if (!base) {
+ printk(KERN_ERR "page cgroup allocation failure\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * The passed "pfn" may not be aligned to SECTION. For the calculation
+ * we need to apply a mask.
+ */
+ pfn &= PAGE_SECTION_MASK;
+ section->page_cgroup = base - pfn;
+ total_usage += table_size;
+ return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_cgroup(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size =
+ sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
+ }
+}
+
+static void __free_page_cgroup(unsigned long pfn)
+{
+ struct mem_section *ms;
+ struct page_cgroup *base;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_cgroup)
+ return;
+ base = ms->page_cgroup + pfn;
+ free_page_cgroup(base);
+ ms->page_cgroup = NULL;
+}
+
+static int __meminit online_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
+{
+ unsigned long start, end, pfn;
+ int fail = 0;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ if (nid == -1) {
+ /*
+ * In this case, "nid" already exists and contains valid memory.
+ * "start_pfn" passed to us is a pfn which is an arg for
+ * online_pages(), and start_pfn should exist.
+ */
+ nid = pfn_to_nid(start_pfn);
+ VM_BUG_ON(!node_state(nid, N_ONLINE));
+ }
+
+ for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+ if (!pfn_present(pfn))
+ continue;
+ fail = init_section_page_cgroup(pfn, nid);
+ }
+ if (!fail)
+ return 0;
+
+ /* rollback */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_cgroup(pfn);
+
+ return -ENOMEM;
+}
+
+static int __meminit offline_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
+{
+ unsigned long start, end, pfn;
+
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __free_page_cgroup(pfn);
+ return 0;
+
+}
+
+static int __meminit page_cgroup_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = arg;
+ int ret = 0;
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ ret = online_page_cgroup(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_OFFLINE:
+ offline_page_cgroup(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_CANCEL_ONLINE:
+ offline_page_cgroup(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
+ case MEM_GOING_OFFLINE:
+ break;
+ case MEM_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+#endif
+
+void __init page_cgroup_init(void)
+{
+ unsigned long pfn;
+ int nid;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ for_each_node_state(nid, N_MEMORY) {
+ unsigned long start_pfn, end_pfn;
+
+ start_pfn = node_start_pfn(nid);
+ end_pfn = node_end_pfn(nid);
+ /*
+ * start_pfn and end_pfn may not be aligned to SECTION and the
+ * page->flags of out of node pages are not initialized. So we
+ * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+ */
+ for (pfn = start_pfn;
+ pfn < end_pfn;
+ pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+ if (!pfn_valid(pfn))
+ continue;
+ /*
+ * Nodes' pfn ranges can overlap.
+ * We know some arch can have a nodes layout such as
+ * -------------pfn-------------->
+ * N0 | N1 | N2 | N0 | N1 | N2|....
+ */
+ if (pfn_to_nid(pfn) != nid)
+ continue;
+ if (init_section_page_cgroup(pfn, nid))
+ goto oom;
+ }
+ }
+ hotplug_memory_notifier(page_cgroup_callback, 0);
+ printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+ printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+ "don't want memory cgroups\n");
+ return;
+oom:
+ printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+ panic("Out of memory");
+}
+
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+ return;
+}
+
+#endif
+
+
+#ifdef CONFIG_MEMCG_SWAP
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+ struct page **map;
+ unsigned long length;
+ spinlock_t lock;
+};
+
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+struct swap_cgroup {
+ unsigned short id;
+};
+#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ * - we have no race in "exchange" when we're accessed via SwapCache because
+ * SwapCache(and its swp_entry) is under lock.
+ * - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+ struct page *page;
+ struct swap_cgroup_ctrl *ctrl;
+ unsigned long idx, max;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ for (idx = 0; idx < ctrl->length; idx++) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto not_enough_page;
+ ctrl->map[idx] = page;
+ }
+ return 0;
+not_enough_page:
+ max = idx;
+ for (idx = 0; idx < max; idx++)
+ __free_page(ctrl->map[idx]);
+
+ return -ENOMEM;
+}
+
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+ struct swap_cgroup_ctrl **ctrlp)
+{
+ pgoff_t offset = swp_offset(ent);
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ if (ctrlp)
+ *ctrlp = ctrl;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ sc = page_address(mappage);
+ return sc + offset % SC_PER_PAGE;
+}
+
+/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned long flags;
+ unsigned short retval;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ retval = sc->id;
+ if (retval == old)
+ sc->id = new;
+ else
+ retval = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return retval;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @id: mem_cgroup to be recorded
+ *
+ * Returns old value at success, 0 at failure.
+ * (Of course, old value can be 0.)
+ */
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned short old;
+ unsigned long flags;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ old = sc->id;
+ sc->id = id;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ return old;
+}
+
+/**
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ */
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+ return lookup_swap_cgroup(ent, NULL)->id;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+ void *array;
+ unsigned long array_size;
+ unsigned long length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return 0;
+
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
+ array_size = length * sizeof(void *);
+
+ array = vzalloc(array_size);
+ if (!array)
+ goto nomem;
+
+ ctrl = &swap_cgroup_ctrl[type];
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl->length = length;
+ ctrl->map = array;
+ spin_lock_init(&ctrl->lock);
+ if (swap_cgroup_prepare(type)) {
+ /* memory shortage */
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
+ goto nomem;
+ }
+ mutex_unlock(&swap_cgroup_mutex);
+
+ return 0;
+nomem:
+ printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+ printk(KERN_INFO
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+ struct page **map;
+ unsigned long i, length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return;
+
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl = &swap_cgroup_ctrl[type];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
+ if (page)
+ __free_page(page);
+ }
+ vfree(map);
+ }
+}
+
+#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index d4840ecbf8f..955db8b0d49 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,46 +12,42 @@
#include <linux/mm.h>
#include <linux/kernel_stat.h>
+#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
+#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
+#include <linux/aio.h>
+#include <linux/blkdev.h>
#include <asm/pgtable.h>
-static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
struct bio *bio;
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- struct swap_info_struct *sis;
- swp_entry_t entry = { .val = index, };
-
- sis = get_swap_info_struct(swp_type(entry));
- bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
- (PAGE_SIZE >> 9);
- bio->bi_bdev = sis->bdev;
+ bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_idx = 0;
- bio->bi_size = PAGE_SIZE;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
}
-static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
+void end_swap_bio_write(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_size)
- return 1;
-
if (!uptodate) {
SetPageError(page);
/*
@@ -66,35 +62,167 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err)
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
- return 0;
}
-int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
+void end_swap_bio_read(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (bio->bi_size)
- return 1;
-
if (!uptodate) {
SetPageError(page);
ClearPageUptodate(page);
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
- } else {
- SetPageUptodate(page);
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto out;
}
+
+ SetPageUptodate(page);
+
+ /*
+ * There is no guarantee that the page is in swap cache - the software
+ * suspend code (at least) uses end_swap_bio_read() against a non-
+ * swapcache page. So we must check PG_swapcache before proceeding with
+ * this optimization.
+ */
+ if (likely(PageSwapCache(page))) {
+ struct swap_info_struct *sis;
+
+ sis = page_swap_info(page);
+ if (sis->flags & SWP_BLKDEV) {
+ /*
+ * The swap subsystem performs lazy swap slot freeing,
+ * expecting that the page will be swapped out again.
+ * So we can avoid an unnecessary write if the page
+ * isn't redirtied.
+ * This is good for real swap storage because we can
+ * reduce unnecessary I/O and enhance wear-leveling
+ * if an SSD is used as the swap device.
+ * But if in-memory swap device (eg zram) is used,
+ * this causes a duplicated copy between uncompressed
+ * data in VM-owned memory and compressed data in
+ * zram-owned memory. So let's free zram-owned memory
+ * and make the VM-owned decompressed page *dirty*,
+ * so the page should be swapped out somewhere again if
+ * we again wish to reclaim it.
+ */
+ struct gendisk *disk = sis->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify) {
+ swp_entry_t entry;
+ unsigned long offset;
+
+ entry.val = page_private(page);
+ offset = swp_offset(entry);
+
+ SetPageDirty(page);
+ disk->fops->swap_slot_free_notify(sis->bdev,
+ offset);
+ }
+ }
+ }
+
+out:
unlock_page(page);
bio_put(bio);
- return 0;
+}
+
+int generic_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned blocks_per_page;
+ unsigned long page_no;
+ unsigned blkbits;
+ sector_t probe_block;
+ sector_t last_block;
+ sector_t lowest_block = -1;
+ sector_t highest_block = 0;
+ int nr_extents = 0;
+ int ret;
+
+ blkbits = inode->i_blkbits;
+ blocks_per_page = PAGE_SIZE >> blkbits;
+
+ /*
+ * Map all the blocks into the extent list. This code doesn't try
+ * to be very smart.
+ */
+ probe_block = 0;
+ page_no = 0;
+ last_block = i_size_read(inode) >> blkbits;
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
+ unsigned block_in_page;
+ sector_t first_block;
+
+ first_block = bmap(inode, probe_block);
+ if (first_block == 0)
+ goto bad_bmap;
+
+ /*
+ * It must be PAGE_SIZE aligned on-disk
+ */
+ if (first_block & (blocks_per_page - 1)) {
+ probe_block++;
+ goto reprobe;
+ }
+
+ for (block_in_page = 1; block_in_page < blocks_per_page;
+ block_in_page++) {
+ sector_t block;
+
+ block = bmap(inode, probe_block + block_in_page);
+ if (block == 0)
+ goto bad_bmap;
+ if (block != first_block + block_in_page) {
+ /* Discontiguity */
+ probe_block++;
+ goto reprobe;
+ }
+ }
+
+ first_block >>= (PAGE_SHIFT - blkbits);
+ if (page_no) { /* exclude the header page */
+ if (first_block < lowest_block)
+ lowest_block = first_block;
+ if (first_block > highest_block)
+ highest_block = first_block;
+ }
+
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1, first_block);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
+ page_no++;
+ probe_block += blocks_per_page;
+reprobe:
+ continue;
+ }
+ ret = nr_extents;
+ *span = 1 + highest_block - lowest_block;
+ if (page_no == 0)
+ page_no = 1; /* force Empty message */
+ sis->max = page_no;
+ sis->pages = page_no - 1;
+ sis->highest_bit = page_no - 1;
+out:
+ return ret;
+bad_bmap:
+ printk(KERN_ERR "swapon: swapfile has holes\n");
+ ret = -EINVAL;
+ goto out;
}
/*
@@ -103,15 +231,92 @@ int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err)
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
- struct bio *bio;
- int ret = 0, rw = WRITE;
+ int ret = 0;
- if (remove_exclusive_swap_page(page)) {
+ if (try_to_free_swap(page)) {
unlock_page(page);
goto out;
}
- bio = get_swap_bio(GFP_NOIO, page_private(page), page,
- end_swap_bio_write);
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
+ ret = __swap_writepage(page, wbc, end_swap_bio_write);
+out:
+ return ret;
+}
+
+static sector_t swap_page_sector(struct page *page)
+{
+ return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+}
+
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ void (*end_write_func)(struct bio *, int))
+{
+ struct bio *bio;
+ int ret, rw = WRITE;
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ struct kiocb kiocb;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct bio_vec bv = {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0
+ };
+ struct iov_iter from = {
+ .type = ITER_BVEC | WRITE,
+ .count = PAGE_SIZE,
+ .iov_offset = 0,
+ .nr_segs = 1,
+ };
+ from.bvec = &bv; /* older gcc versions are broken */
+
+ init_sync_kiocb(&kiocb, swap_file);
+ kiocb.ki_pos = page_file_offset(page);
+ kiocb.ki_nbytes = PAGE_SIZE;
+
+ set_page_writeback(page);
+ unlock_page(page);
+ ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
+ &kiocb, &from,
+ kiocb.ki_pos);
+ if (ret == PAGE_SIZE) {
+ count_vm_event(PSWPOUT);
+ ret = 0;
+ } else {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * rotate_reclaimable_page, and rate-limit the
+ * messages. Unlike the normal direct-to-bio
+ * case, do not flag PageError, as the failure
+ * could be temporary.
+ */
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
+ page_file_offset(page));
+ }
+ end_page_writeback(page);
+ return ret;
+ }
+
+ ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
+ if (!ret) {
+ count_vm_event(PSWPOUT);
+ return 0;
+ }
+
+ ret = 0;
+ bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
@@ -119,7 +324,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= (1 << BIO_RW_SYNC);
+ rw |= REQ_SYNC;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
@@ -128,15 +333,38 @@ out:
return ret;
}
-int swap_readpage(struct file *file, struct page *page)
+int swap_readpage(struct page *page)
{
struct bio *bio;
int ret = 0;
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
- BUG_ON(!PageLocked(page));
- ClearPageUptodate(page);
- bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
- end_swap_bio_read);
+ ret = mapping->a_ops->readpage(swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ return ret;
+ }
+
+ ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+ if (!ret) {
+ count_vm_event(PSWPIN);
+ return 0;
+ }
+
+ ret = 0;
+ bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
ret = -ENOMEM;
@@ -148,47 +376,14 @@ out:
return ret;
}
-#ifdef CONFIG_SOFTWARE_SUSPEND
-/*
- * A scruffy utility function to read or write an arbitrary swap page
- * and wait on the I/O. The caller must have a ref on the page.
- *
- * We use end_swap_bio_read() even for writes, because it happens to do what
- * we want.
- */
-int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page,
- struct bio **bio_chain)
+int swap_set_page_dirty(struct page *page)
{
- struct bio *bio;
- int ret = 0;
- int bio_rw;
-
- lock_page(page);
+ struct swap_info_struct *sis = page_swap_info(page);
- bio = get_swap_bio(GFP_KERNEL, entry.val, page, end_swap_bio_read);
- if (bio == NULL) {
- unlock_page(page);
- ret = -ENOMEM;
- goto out;
- }
-
- bio_rw = rw;
- if (!bio_chain)
- bio_rw |= (1 << BIO_RW_SYNC);
- if (bio_chain)
- bio_get(bio);
- submit_bio(bio_rw, bio);
- if (bio_chain == NULL) {
- wait_on_page_locked(page);
-
- if (!PageUptodate(page) || PageError(page))
- ret = -EIO;
- }
- if (bio_chain) {
- bio->bi_private = *bio_chain;
- *bio_chain = bio;
+ if (sis->flags & SWP_FILE) {
+ struct address_space *mapping = sis->swap_file->f_mapping;
+ return mapping->a_ops->set_page_dirty(page);
+ } else {
+ return __set_page_dirty_no_writeback(page);
}
-out:
- return ret;
}
-#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 00000000000..d1473b2e948
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,273 @@
+/*
+ * linux/mm/page_isolation.c
+ */
+
+#include <linux/mm.h>
+#include <linux/page-isolation.h>
+#include <linux/pageblock-flags.h>
+#include <linux/memory.h>
+#include <linux/hugetlb.h>
+#include "internal.h"
+
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+{
+ struct zone *zone;
+ unsigned long flags, pfn;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
+ int ret = -EBUSY;
+
+ zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ pfn = page_to_pfn(page);
+ arg.start_pfn = pfn;
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
+ /*
+ * It may be possible to isolate a pageblock even if the
+ * migratetype is not MIGRATE_MOVABLE. The memory isolation
+ * notifier chain is used by balloon drivers to return the
+ * number of pages in a range that are held by the balloon
+ * driver to shrink memory. If all the pages are accounted for
+ * by balloons, are free, or on the LRU, isolation can continue.
+ * Later, for example, when memory hotplug notifier runs, these
+ * pages reported as "can be isolated" should be isolated(freed)
+ * by the balloon driver through the memory notifier chain.
+ *
+ * If the notifier chain reports an error, ret stays at -EBUSY and
+ * the has_unmovable_pages() check below is skipped.
+ */
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret)
+ goto out;
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (!has_unmovable_pages(zone, page, arg.pages_found,
+ skip_hwpoisoned_pages))
+ ret = 0;
+
+ /*
+ * NOTE(review): "immobile" presumably refers to the not-on-lru pages
+ * examined by has_unmovable_pages() above. If there are more of them
+ * than the removable-by-driver pages reported by the notifier
+ * (arg.pages_found), ret stays at -EBUSY and we fail.
+ *
+ * On success (ret == 0) the pageblock is switched to MIGRATE_ISOLATE,
+ * move_freepages_block() is asked to move its free pages to
+ * MIGRATE_ISOLATE, and the zone free-page state is adjusted by the
+ * negated count it returns, using the pageblock's previous
+ * migratetype. drain_all_pages() runs only after zone->lock has been
+ * dropped.
+ */
+
+out:
+ if (!ret) {
+ unsigned long nr_pages;
+ int migratetype = get_pageblock_migratetype(page);
+
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+
+ __mod_zone_freepage_state(zone, -nr_pages, migratetype);
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (!ret)
+ drain_all_pages();
+ return ret;
+}
+
+/*
+ * Give an isolated pageblock back to @migratetype.
+ *
+ * Under zone->lock: if the pageblock containing @page is currently
+ * MIGRATE_ISOLATE, move its free pages to @migratetype, add the moved
+ * count to the zone free-page state and relabel the pageblock. A
+ * pageblock that is not isolated is left untouched.
+ */
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&zone->lock, irq_flags);
+	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) {
+		unsigned long moved;
+
+		moved = move_freepages_block(zone, page, migratetype);
+		__mod_zone_freepage_state(zone, moved, migratetype);
+		set_pageblock_migratetype(page, migratetype);
+	}
+	spin_unlock_irqrestore(&zone->lock, irq_flags);
+}
+
+/*
+ * Return the struct page of the first pfn in [pfn, pfn + nr_pages) for
+ * which pfn_valid_within() is true, or NULL if there is none.
+ *
+ * The index is unsigned long, like @nr_pages: callers may pass an
+ * arbitrary span (test_pages_isolated() passes end_pfn - start_pfn), and
+ * a signed int index would overflow on a very large range with no valid
+ * pfn.
+ */
+static inline struct page *
+__first_valid_page(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		if (pfn_valid_within(pfn + i))
+			break;
+	if (unlikely(i == nr_pages))
+		return NULL;
+	return pfn_to_page(pfn + i);
+}
+
+/*
+ * start_isolate_page_range() -- make page-allocation-type of range of pages
+ * to be MIGRATE_ISOLATE.
+ * @start_pfn: The lower PFN of the range to be isolated.
+ * @end_pfn: The upper PFN of the range to be isolated.
+ * @migratetype: migrate type to set in error recovery.
+ * @skip_hwpoisoned_pages: passed through to set_migratetype_isolate().
+ *
+ * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
+ * the range will never be allocated. Any free pages and pages freed in the
+ * future will not be allocated again.
+ *
+ * start_pfn/end_pfn must be aligned to pageblock_order.
+ * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
+ *
+ * Pageblocks without any valid pfn are skipped. On failure, the
+ * pageblocks isolated so far are handed back to @migratetype.
+ */
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned migratetype, bool skip_hwpoisoned_pages)
+{
+	unsigned long pfn;
+	unsigned long undo_pfn;
+	struct page *page;
+
+	BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
+	BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+	for (pfn = start_pfn;
+	     pfn < end_pfn;
+	     pfn += pageblock_nr_pages) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page &&
+		    set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
+			undo_pfn = pfn;
+			goto undo;
+		}
+	}
+	return 0;
+undo:
+	for (pfn = start_pfn;
+	     pfn < undo_pfn;
+	     pfn += pageblock_nr_pages) {
+		/*
+		 * Look the page up the same way the forward pass did: the
+		 * first pfn of a pageblock is not guaranteed to be valid,
+		 * so pfn_to_page(pfn) must not be used blindly here.
+		 */
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (!page)
+			continue;
+		unset_migratetype_isolate(page, migratetype);
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Hand every isolated pageblock in [start_pfn, end_pfn) back to
+ * @migratetype. Both bounds must be pageblock aligned. Pageblocks with no
+ * valid pfn, or that are not MIGRATE_ISOLATE, are skipped. Always
+ * returns 0.
+ */
+int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			    unsigned migratetype)
+{
+	unsigned long pfn = start_pfn;
+
+	BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
+	BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+	while (pfn < end_pfn) {
+		struct page *page = __first_valid_page(pfn, pageblock_nr_pages);
+
+		if (page && get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
+			unset_migratetype_isolate(page, migratetype);
+		pfn += pageblock_nr_pages;
+	}
+	return 0;
+}
+/*
+ * Test whether all pages in the range are free (meaning isolated) or not.
+ * All pages in [pfn...end_pfn) must be in the same zone.
+ * zone->lock must be held before calling this.
+ *
+ * pfns for which pfn_valid_within() is false are skipped. When
+ * @skip_hwpoisoned_pages is true, HWPoison pages are skipped as well.
+ *
+ * Returns 1 if all pages in the range are isolated, 0 as soon as a page
+ * is found that matches none of the accepted cases.
+ */
+static int
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
+ bool skip_hwpoisoned_pages)
+{
+ struct page *page;
+
+ while (pfn < end_pfn) {
+ if (!pfn_valid_within(pfn)) {
+ pfn++;
+ continue;
+ }
+ page = pfn_to_page(pfn);
+ if (PageBuddy(page)) {
+ /*
+ * If a race between isolation and allocation happens,
+ * some free pages could be in MIGRATE_MOVABLE list
+ * although pageblock's migration type of the page
+ * is MIGRATE_ISOLATE. Catch it and move the page into
+ * MIGRATE_ISOLATE list.
+ *
+ * Either way, advance past the whole buddy chunk
+ * (1 << page_order(page) pages).
+ */
+ if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
+ struct page *end_page;
+
+ end_page = page + (1 << page_order(page)) - 1;
+ move_freepages(page_zone(page), page, end_page,
+ MIGRATE_ISOLATE);
+ }
+ pfn += 1 << page_order(page);
+ }
+ else if (page_count(page) == 0 &&
+ get_freepage_migratetype(page) == MIGRATE_ISOLATE)
+ pfn += 1;
+ else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
+ /*
+ * The HWPoisoned page may be not in buddy
+ * system, and page_count() is not 0.
+ */
+ pfn++;
+ continue;
+ }
+ else
+ break;
+ }
+ if (pfn < end_pfn)
+ return 0;
+ return 1;
+}
+
+/*
+ * Check that every page in [start_pfn, end_pfn) is isolated.
+ *
+ * Returns 0 when the whole range is isolated and -EBUSY otherwise,
+ * including when the range contains no valid page at all.
+ */
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+			bool skip_hwpoisoned_pages)
+{
+	unsigned long pfn = start_pfn;
+	unsigned long irq_flags;
+	struct page *page;
+	struct zone *zone;
+	int isolated;
+
+	/*
+	 * Free chunks need not be aligned to pageblock_nr_pages, so the
+	 * first pass only looks at each pageblock's migratetype and stops
+	 * at the first one that is not MIGRATE_ISOLATE.
+	 */
+	while (pfn < end_pfn) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+			break;
+		pfn += pageblock_nr_pages;
+	}
+
+	page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+	if (!page || pfn < end_pfn)
+		return -EBUSY;
+
+	/* Second pass, under zone->lock: every page free or isolated? */
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, irq_flags);
+	isolated = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+						     skip_hwpoisoned_pages);
+	spin_unlock_irqrestore(&zone->lock, irq_flags);
+
+	if (isolated)
+		return 0;
+	return -EBUSY;
+}
+
+/*
+ * Allocate a destination page for migrating @page away.
+ *
+ * Huge pages get a huge page of the same hstate from another node;
+ * everything else gets a single GFP_USER | __GFP_MOVABLE page, with
+ * __GFP_HIGHMEM added when the source page is in highmem. @private and
+ * @resultp are not used here.
+ */
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+				  int **resultp)
+{
+	gfp_t gfp_mask;
+
+	/*
+	 * TODO: pick the destination hugepage from the nearest neighbour
+	 * node, honouring the memory policy of the user process where
+	 * possible. As a simple work-around the next node is used for now.
+	 *
+	 * NOTE(review): confirm what next_node() yields when no node
+	 * follows the source node in the complemented mask.
+	 */
+	if (PageHuge(page)) {
+		nodemask_t src = nodemask_of_node(page_to_nid(page));
+		nodemask_t dst;
+		int target_nid;
+
+		nodes_complement(dst, src);
+		target_nid = next_node(page_to_nid(page), dst);
+		return alloc_huge_page_node(page_hstate(compound_head(page)),
+					    target_nid);
+	}
+
+	gfp_mask = GFP_USER | __GFP_MOVABLE;
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	return alloc_page(gfp_mask);
+}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 00000000000..2beeabf502c
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,248 @@
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Call walk->pte_entry() for every PAGE_SIZE step of [addr, end) under
+ * @pmd. Stops at the first non-zero callback result and returns it;
+ * returns 0 when the whole range was visited.
+ */
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pte_t *pte = pte_offset_map(pmd, addr);
+	int err;
+
+	while ((err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk)) == 0) {
+		addr += PAGE_SIZE;
+		if (addr == end)
+			break;
+		pte++;
+	}
+
+	pte_unmap(pte);
+	return err;
+}
+
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err = 0;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+again:
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(*pmd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+ /*
+ * ->pmd_entry() is called before any huge page split below.
+ * This implies that each ->pmd_entry() handler
+ * needs to know about pmd_trans_huge() pmds
+ */
+ if (walk->pmd_entry)
+ err = walk->pmd_entry(pmd, addr, next, walk);
+ if (err)
+ break;
+
+ /*
+ * Check this here so we only break down trans_huge
+ * pages when we _need_ to, i.e. only when there is a
+ * ->pte_entry() callback to run below this pmd.
+ *
+ * Both "continue" statements in this loop jump to the loop
+ * condition, which advances pmd and addr. "goto again" after
+ * the split does not advance: the same pmd is examined once
+ * more from the top, so ->pte_hole() or ->pmd_entry() may run
+ * for it again.
+ */
+ if (!walk->pte_entry)
+ continue;
+
+ split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ goto again;
+ err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+
+ return err;
+}
+
+/*
+ * Walk the pud entries covering [addr, end) under @pgd.
+ *
+ * An empty or bad pud is reported through walk->pte_hole() when that
+ * callback is set. Otherwise walk->pud_entry() is called (if set) and,
+ * when a pmd or pte callback exists, the walk descends to the pmd level.
+ * The first non-zero result stops the walk and is returned.
+ */
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	int err = 0;
+
+	for (;;) {
+		unsigned long next = pud_addr_end(addr, end);
+
+		if (pud_none_or_clear_bad(pud)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+		} else {
+			if (walk->pud_entry)
+				err = walk->pud_entry(pud, addr, next, walk);
+			if (!err && (walk->pmd_entry || walk->pte_entry))
+				err = walk_pmd_range(pud, addr, next, walk);
+		}
+		if (err)
+			break;
+
+		pud++;
+		addr = next;
+		if (addr == end)
+			break;
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * End of the huge page entry that contains @addr, clamped to @end.
+ */
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+				       unsigned long end)
+{
+	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+
+	if (boundary < end)
+		return boundary;
+	return end;
+}
+
+/*
+ * Walk [addr, end) of a hugetlb @vma one huge page entry at a time,
+ * calling walk->hugetlb_entry() for every entry that has a pte. Returns
+ * the first non-zero callback result, or 0.
+ */
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+			      unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	struct hstate *h = hstate_vma(vma);
+	unsigned long hmask = huge_page_mask(h);
+
+	for (;;) {
+		unsigned long next = hugetlb_entry_end(h, addr, end);
+		pte_t *pte = huge_pte_offset(walk->mm, addr & hmask);
+
+		if (pte && walk->hugetlb_entry) {
+			int err = walk->hugetlb_entry(pte, hmask, addr, next,
+						      walk);
+			if (err)
+				return err;
+		}
+
+		addr = next;
+		if (addr == end)
+			return 0;
+	}
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+/* Without hugetlb support there is nothing to walk; always returns 0. */
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+			      unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+
+
+/**
+ * walk_page_range - walk a memory map's page tables with a callback
+ * @addr: starting address
+ * @end: ending address
+ * @walk: set of callbacks to invoke for each level of the tree
+ *
+ * Recursively walk the page table for the memory area in a VMA,
+ * calling supplied callbacks. Callbacks are called in-order (first
+ * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
+ * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ *
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and the mm_walk pointer passed to this function, for
+ * access to the ->private or ->mm fields.
+ *
+ * Usually no locks are taken, but splitting transparent huge page may
+ * take page table lock. And the bottom level iterator will map PTE
+ * directories from highmem if necessary.
+ *
+ * If any callback returns a non-zero value, the walk is aborted and
+ * the return value is propagated back to the caller. Otherwise 0 is returned.
+ * 0 is also returned right away when @addr >= @end, and -EINVAL when
+ * walk->mm is NULL.
+ *
+ * walk->mm->mmap_sem must be held for at least read; a VM_BUG_ON() checks
+ * that it is locked, whether or not walk->hugetlb_entry is set.
+ */
+int walk_page_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ if (addr >= end)
+ return err;
+
+ if (!walk->mm)
+ return -EINVAL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ struct vm_area_struct *vma = NULL;
+
+ next = pgd_addr_end(addr, end);
+
+ /*
+ * This function was not intended to be vma based.
+ * But there are vma special cases to be handled:
+ * - hugetlb vma's
+ * - VM_PFNMAP vma's
+ */
+ vma = find_vma(walk->mm, addr);
+ if (vma) {
+ /*
+ * There are no page structures backing a VM_PFNMAP
+ * range, so do not allow split_huge_page_pmd().
+ * The whole vma is skipped; because "continue" bypasses
+ * the pgd++ at the bottom of the loop, pgd is looked up
+ * again for the new position.
+ */
+ if ((vma->vm_start <= addr) &&
+ (vma->vm_flags & VM_PFNMAP)) {
+ next = vma->vm_end;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
+ /*
+ * Handle hugetlb vma individually because pagetable
+ * walk for the hugetlb page is dependent on the
+ * architecture and we can't handle it in the same
+ * manner as non-huge pages.
+ */
+ if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+ is_vm_hugetlb_page(vma)) {
+ if (vma->vm_end < next)
+ next = vma->vm_end;
+ /*
+ * Hugepage is very tightly coupled with vma,
+ * so walk through hugetlb entries within a
+ * given vma.
+ */
+ err = walk_hugetlb_range(vma, addr, next, walk);
+ if (err)
+ break;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
+ }
+
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ pgd++;
+ continue;
+ }
+ if (walk->pgd_entry)
+ err = walk->pgd_entry(pgd, addr, next, walk);
+ if (!err &&
+ (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
+ err = walk_pud_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ pgd++;
+ } while (addr = next, addr < end);
+
+ return err;
+}
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index b02102feeb4..00000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * mm/pdflush.c - worker threads for writing back filesystem data
- *
- * Copyright (C) 2002, Linus Torvalds.
- *
- * 09Apr2002 akpm@zip.com.au
- * Initial version
- * 29Feb2004 kaos@sgi.com
- * Move worker thread creation to kthread to avoid chewing
- * up stack space with nested calls to kernel_thread.
- */
-
-#include <linux/sched.h>
-#include <linux/list.h>
-#include <linux/signal.h>
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h> // Needed by writeback.h
-#include <linux/writeback.h> // Prototypes pdflush_operation()
-#include <linux/kthread.h>
-#include <linux/cpuset.h>
-
-
-/*
- * Minimum and maximum number of pdflush instances
- */
-#define MIN_PDFLUSH_THREADS 2
-#define MAX_PDFLUSH_THREADS 8
-
-static void start_one_pdflush_thread(void);
-
-
-/*
- * The pdflush threads are worker threads for writing back dirty data.
- * Ideally, we'd like one thread per active disk spindle. But the disk
- * topology is very hard to divine at this level. Instead, we take
- * care in various places to prevent more than one pdflush thread from
- * performing writeback against a single filesystem. pdflush threads
- * have the PF_FLUSHER flag set in current->flags to aid in this.
- */
-
-/*
- * All the pdflush threads. Protected by pdflush_lock
- */
-static LIST_HEAD(pdflush_list);
-static DEFINE_SPINLOCK(pdflush_lock);
-
-/*
- * The count of currently-running pdflush threads. Protected
- * by pdflush_lock.
- *
- * Readable by sysctl, but not writable. Published to userspace at
- * /proc/sys/vm/nr_pdflush_threads.
- */
-int nr_pdflush_threads = 0;
-
-/*
- * The time at which the pdflush thread pool last went empty
- */
-static unsigned long last_empty_jifs;
-
-/*
- * The pdflush thread.
- *
- * Thread pool management algorithm:
- *
- * - The minimum and maximum number of pdflush instances are bound
- * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
- *
- * - If there have been no idle pdflush instances for 1 second, create
- * a new one.
- *
- * - If the least-recently-went-to-sleep pdflush thread has been asleep
- * for more than one second, terminate a thread.
- */
-
-/*
- * A structure for passing work to a pdflush thread. Also for passing
- * state information between pdflush threads. Protected by pdflush_lock.
- */
-struct pdflush_work {
- struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
- unsigned long arg0; /* An argument to the callback */
- struct list_head list; /* On pdflush_list, when idle */
- unsigned long when_i_went_to_sleep;
-};
-
-static int __pdflush(struct pdflush_work *my_work)
-{
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
- my_work->fn = NULL;
- my_work->who = current;
- INIT_LIST_HEAD(&my_work->list);
-
- spin_lock_irq(&pdflush_lock);
- nr_pdflush_threads++;
- for ( ; ; ) {
- struct pdflush_work *pdf;
-
- set_current_state(TASK_INTERRUPTIBLE);
- list_move(&my_work->list, &pdflush_list);
- my_work->when_i_went_to_sleep = jiffies;
- spin_unlock_irq(&pdflush_lock);
- schedule();
- try_to_freeze();
- spin_lock_irq(&pdflush_lock);
- if (!list_empty(&my_work->list)) {
- /*
- * Someone woke us up, but without removing our control
- * structure from the global list. swsusp will do this
- * in try_to_freeze()->refrigerator(). Handle it.
- */
- my_work->fn = NULL;
- continue;
- }
- if (my_work->fn == NULL) {
- printk("pdflush: bogus wakeup\n");
- continue;
- }
- spin_unlock_irq(&pdflush_lock);
-
- (*my_work->fn)(my_work->arg0);
-
- /*
- * Thread creation: For how long have there been zero
- * available threads?
- */
- if (jiffies - last_empty_jifs > 1 * HZ) {
- /* unlocked list_empty() test is OK here */
- if (list_empty(&pdflush_list)) {
- /* unlocked test is OK here */
- if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
- start_one_pdflush_thread();
- }
- }
-
- spin_lock_irq(&pdflush_lock);
- my_work->fn = NULL;
-
- /*
- * Thread destruction: For how long has the sleepiest
- * thread slept?
- */
- if (list_empty(&pdflush_list))
- continue;
- if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
- continue;
- pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
- if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
- /* Limit exit rate */
- pdf->when_i_went_to_sleep = jiffies;
- break; /* exeunt */
- }
- }
- nr_pdflush_threads--;
- spin_unlock_irq(&pdflush_lock);
- return 0;
-}
-
-/*
- * Of course, my_work wants to be just a local in __pdflush(). It is
- * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations against the auto variables. Because
- * these are visible to other tasks and CPUs. (No problem has actually
- * been observed. This is just paranoia).
- */
-static int pdflush(void *dummy)
-{
- struct pdflush_work my_work;
- cpumask_t cpus_allowed;
-
- /*
- * pdflush can spend a lot of time doing encryption via dm-crypt. We
- * don't want to do that at keventd's priority.
- */
- set_user_nice(current, 0);
-
- /*
- * Some configs put our parent kthread in a limited cpuset,
- * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
- * Our needs are more modest - cut back to our cpusets cpus_allowed.
- * This is needed as pdflush's are dynamically created and destroyed.
- * The boottime pdflush's are easily placed w/o these 2 lines.
- */
- cpus_allowed = cpuset_cpus_allowed(current);
- set_cpus_allowed(current, cpus_allowed);
-
- return __pdflush(&my_work);
-}
-
-/*
- * Attempt to wake up a pdflush thread, and get it to do some work for you.
- * Returns zero if it indeed managed to find a worker thread, and passed your
- * payload to it.
- */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
-{
- unsigned long flags;
- int ret = 0;
-
- BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
-
- spin_lock_irqsave(&pdflush_lock, flags);
- if (list_empty(&pdflush_list)) {
- spin_unlock_irqrestore(&pdflush_lock, flags);
- ret = -1;
- } else {
- struct pdflush_work *pdf;
-
- pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
- list_del_init(&pdf->list);
- if (list_empty(&pdflush_list))
- last_empty_jifs = jiffies;
- pdf->fn = fn;
- pdf->arg0 = arg0;
- wake_up_process(pdf->who);
- spin_unlock_irqrestore(&pdflush_lock, flags);
- }
- return ret;
-}
-
-static void start_one_pdflush_thread(void)
-{
- kthread_run(pdflush, NULL, "pdflush");
-}
-
-static int __init pdflush_init(void)
-{
- int i;
-
- for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
- start_one_pdflush_thread();
- return 0;
-}
-
-module_init(pdflush_init);
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 00000000000..89633fefc6a
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,108 @@
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are allocated as a contiguous kernel memory using gfp
+ * allocation. This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
+ * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
+ * fine.
+ *
+ * - NUMA is not supported. When setting up the first chunk,
+ * @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ * than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is power of two multiple of
+ * PAGE_SIZE. Because each chunk is allocated as a contiguous
+ * kernel memory block using alloc_pages(), memory will be wasted if
+ * chunk size is not aligned. percpu-km code will whine about it.
+ */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+
+#include <linux/log2.h>
+
+/*
+ * Zero @size bytes at offset @off in every possible CPU's unit of
+ * @chunk. No pages are allocated or mapped here. Always returns 0.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *area = (void *)pcpu_chunk_addr(chunk, cpu, 0) + off;
+
+		memset(area, 0, size);
+	}
+
+	return 0;
+}
+
+/* Depopulating is a no-op for this allocator. */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+}
+
+/*
+ * Allocate a chunk descriptor and one alloc_pages() block big enough for
+ * the first group, link every page in it to the chunk, and record the
+ * block and the chunk base address. Returns NULL if either allocation
+ * fails.
+ */
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+	struct pcpu_chunk *chunk = pcpu_alloc_chunk();
+	struct page *first_page;
+	int idx;
+
+	if (!chunk)
+		return NULL;
+
+	first_page = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+	if (!first_page)
+		goto err_free_chunk;
+
+	idx = 0;
+	while (idx < nr_pages) {
+		pcpu_set_page_chunk(nth_page(first_page, idx), chunk);
+		idx++;
+	}
+
+	chunk->data = first_page;
+	chunk->base_addr = page_address(first_page) - pcpu_group_offsets[0];
+	return chunk;
+
+err_free_chunk:
+	pcpu_free_chunk(chunk);
+	return NULL;
+}
+
+/*
+ * Free the page block recorded in chunk->data, if there is one, then
+ * hand @chunk itself to pcpu_free_chunk(). pcpu_free_chunk() is called
+ * even when @chunk is NULL.
+ */
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+	struct page *block = chunk ? chunk->data : NULL;
+
+	if (block)
+		__free_pages(block, order_base_2(nr_pages));
+
+	pcpu_free_chunk(chunk);
+}
+
+/* Translate a percpu address into its struct page via virt_to_page(). */
+static struct page *pcpu_addr_to_page(void *addr)
+{
+	struct page *page = virt_to_page(addr);
+
+	return page;
+}
+
+/*
+ * Sanity-check the first chunk's allocation info for this allocator.
+ *
+ * Returns -EINVAL unless there is exactly one group. Otherwise returns 0,
+ * after warning when rounding the chunk's page count up to a power of
+ * two leaves pages unused.
+ */
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+	size_t needed_pages, rounded_pages;
+
+	/* all units must be in a single group */
+	if (ai->nr_groups != 1) {
+		printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+		return -EINVAL;
+	}
+
+	needed_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
+	rounded_pages = roundup_pow_of_two(needed_pages);
+
+	if (rounded_pages > needed_pages)
+		printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
+		       rounded_pages - needed_pages);
+
+	return 0;
+}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 00000000000..3707c71ae4c
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,448 @@
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+
+/*
+ * Look up the page backing page index @page_idx of @cpu's unit in
+ * @chunk through vmalloc_to_page(). Warns when used on an immutable
+ * (pre-mapped) chunk.
+ */
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+				    unsigned int cpu, int page_idx)
+{
+	unsigned long addr;
+
+	/* must not be used on pre-mapped chunk */
+	WARN_ON(chunk->immutable);
+
+	addr = pcpu_chunk_addr(chunk, cpu, page_idx);
+	return vmalloc_to_page((void *)addr);
+}
+
+/**
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx(). *@bitmapp is
+ * copied from @chunk->populated on every call. The pages array is
+ * zeroed only when it is first allocated; it is NOT cleared on later
+ * calls and may still hold entries from previous invocations. Note
+ * that there is only one array and bitmap (both are function-static)
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure. Failure
+ * means the array or bitmap does not exist yet and either @may_alloc is
+ * false or the allocation failed.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+ unsigned long **bitmapp,
+ bool may_alloc)
+{
+ static struct page **pages;
+ static unsigned long *bitmap;
+ size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+ size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+ sizeof(unsigned long);
+
+ if (!pages || !bitmap) {
+ if (may_alloc && !pages)
+ pages = pcpu_mem_zalloc(pages_size);
+ if (may_alloc && !bitmap)
+ bitmap = pcpu_mem_zalloc(bitmap_size);
+ if (!pages || !bitmap)
+ return NULL;
+ }
+
+ bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+
+ *bitmapp = bitmap;
+ return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap (not used here)
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * For every possible cpu, free each non-NULL entry of @pages in
+ * [@page_start,@page_end). NULL entries are skipped.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int idx = page_start;
+
+		while (idx < page_end) {
+			struct page *victim = pages[pcpu_page_idx(cpu, idx)];
+
+			idx++;
+			if (!victim)
+				continue;
+			__free_page(victim);
+		}
+	}
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk. Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ *
+ * RETURNS:
+ * 0 on success, -ENOMEM if any page allocation fails. On failure only
+ * the pages allocated by this call are freed.
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, unsigned long *populated,
+			    int page_start, int page_end)
+{
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	unsigned int cpu, tcpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+			if (!*pagep)
+				goto err;
+		}
+	}
+	return 0;
+
+err:
+	/*
+	 * @pages is a shared scratch array that is not cleared between
+	 * uses, so sweeping the whole range with pcpu_free_pages() could
+	 * free stale entries left by an earlier call. Free exactly what
+	 * was allocated above instead: first the pages obtained for the
+	 * cpu that failed ...
+	 */
+	while (--i >= page_start)
+		__free_page(pages[pcpu_page_idx(cpu, i)]);
+
+	/* ... then the full range of every cpu completed before it. */
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		for (i = page_start; i < page_end; i++)
+			__free_page(pages[pcpu_page_idx(tcpu, i)]);
+	}
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped. Flush cache. As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu. This could be an overkill but is more
+ * scalable.
+ *
+ * The flushed region runs from @page_start of the pcpu_low_unit_cpu
+ * unit to @page_end of the pcpu_high_unit_cpu unit.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_cache_vunmap(
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+/* Unmap @nr_pages pages starting at @addr, without flushing. */
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+	const int nr_bytes = nr_pages << PAGE_SHIFT;
+
+	unmap_kernel_range_noflush(addr, nr_bytes);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array, filled in with the pages that get unmapped
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, record the page behind every index in
+ * [@page_start,@page_end) into @pages, then unmap that range out of
+ * @chunk. The recorded entries can carry the pages on to
+ * pcpu_free_pages() once all unmaps are finished. The matching bits in
+ * @populated are cleared at the end. The caller should call proper
+ * pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+			     struct page **pages, unsigned long *populated,
+			     int page_start, int page_end)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int idx;
+
+		for (idx = page_start; idx < page_end; idx++) {
+			struct page *mapped = pcpu_chunk_page(chunk, cpu, idx);
+
+			WARN_ON(!mapped);
+			pages[pcpu_page_idx(cpu, idx)] = mapped;
+		}
+
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				   page_end - page_start);
+	}
+
+	bitmap_clear(populated, page_start, page_end - page_start);
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
+ * TLB for the regions. This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region, from @page_start of the pcpu_low_unit_cpu unit
+ * to @page_end of the pcpu_high_unit_cpu unit.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ flush_tlb_kernel_range(
+ pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+ pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+/*
+ * Map @nr_pages pages from @pages at @addr with PAGE_KERNEL protection,
+ * without flushing. Returns whatever map_kernel_range_noflush() returns.
+ */
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+			    int nr_pages)
+{
+	const int nr_bytes = nr_pages << PAGE_SHIFT;
+
+	return map_kernel_range_noflush(addr, nr_bytes, PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk. The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * @populated and whatever is necessary for reverse lookup
+ * (addr -> chunk).
+ *
+ * RETURNS:
+ * 0 on success, the negative error from __pcpu_map_pages() on failure.
+ * On failure the units mapped so far are unmapped again and the TLB is
+ * flushed for the range; nothing is marked populated.
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+			  struct page **pages, unsigned long *populated,
+			  int page_start, int page_end)
+{
+	unsigned int cpu, tcpu;
+	int i, err;
+
+	for_each_possible_cpu(cpu) {
+		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				       &pages[pcpu_page_idx(cpu, page_start)],
+				       page_end - page_start);
+		if (err < 0)
+			goto err;
+	}
+
+	/* mapping successful, link chunk and mark populated */
+	for (i = page_start; i < page_end; i++) {
+		for_each_possible_cpu(cpu)
+			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+					    chunk);
+		__set_bit(i, populated);
+	}
+
+	return 0;
+
+err:
+	/* undo the mappings of every cpu handled before the failing one */
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+				   page_end - page_start);
+	}
+	/*
+	 * The partial mappings were live for a moment; flush the TLB so no
+	 * stale translation to the just-unmapped pages can linger.
+	 */
+	pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
+	return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
+ * cache for them.
+ *
+ * The flush is issued once for the whole span, from the unit with
+ * the lowest address to the unit with the highest address.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+				int page_start, int page_end)
+{
+	unsigned long span_start = pcpu_chunk_addr(chunk, pcpu_low_unit_cpu,
+						   page_start);
+	unsigned long span_end = pcpu_chunk_addr(chunk, pcpu_high_unit_cpu,
+						 page_end);
+
+	flush_cache_vmap(span_start, span_end);
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, populate and map the pages covering
+ * [@off, @off + @size) into @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * 0 on success, -ENOMEM if the temp pages array can't be obtained,
+ * otherwise the error from pcpu_alloc_pages() or pcpu_map_pages().
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ /* how far allocation / mapping got, used to bound the unwind below */
+ int free_end = page_start, unmap_end = page_start;
+ struct page **pages;
+ unsigned long *populated;
+ unsigned int cpu;
+ int rs, re, rc;
+
+ /* quick path, check whether all pages are already there */
+ rs = page_start;
+ pcpu_next_pop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ goto clear;
+
+ /* need to allocate and map pages, this chunk can't be immutable */
+ WARN_ON(chunk->immutable);
+
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ if (!pages)
+ return -ENOMEM;
+
+ /*
+ * alloc and map
+ *
+ * Both loops walk chunk->populated, which is only updated by the
+ * bitmap_copy() below, so they (and the unwind loops) visit the
+ * same unpopulated regions.
+ */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_free;
+ free_end = re;
+ }
+
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+ if (rc)
+ goto err_unmap;
+ unmap_end = re;
+ }
+ pcpu_post_map_flush(chunk, page_start, page_end);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
+ /* zero the requested byte range in every cpu's unit */
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+ return 0;
+
+err_unmap:
+ /* undo only the regions that were fully mapped, i.e. below unmap_end */
+ pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free:
+ /* release the pages of every region whose allocation succeeded */
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+ return rc;
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ *
+ * For each cpu, depopulate and unmap the pages covering
+ * [@off, @off + @size) from @chunk. pcpu_pre_unmap_flush() is called
+ * before unmapping; the TLB is not flushed here as vmalloc handles
+ * that lazily.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ struct page **pages;
+ unsigned long *populated;
+ int rs, re;
+
+ /* quick path, check whether it's empty already */
+ rs = page_start;
+ pcpu_next_unpop(chunk, &rs, &re, page_end);
+ if (rs == page_start && re == page_end)
+ return;
+
+ /* immutable chunks can't be depopulated */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * If control reaches here, there must have been at least one
+ * successful population attempt so the temp pages array must
+ * be available now.
+ */
+ pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ BUG_ON(!pages);
+
+ /* unmap and free */
+ pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+ /* both loops walk chunk->populated, which is only updated at the end */
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_unmap_pages(chunk, pages, populated, rs, re);
+
+ /* no need to flush tlb, vmalloc will handle it lazily */
+
+ pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+ pcpu_free_pages(chunk, pages, populated, rs, re);
+
+ /* commit new bitmap */
+ bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+}
+
+/*
+ * Allocate a chunk and the vm areas backing it.  Returns the new
+ * chunk, or NULL if either allocation fails.
+ */
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+	struct pcpu_chunk *chunk = pcpu_alloc_chunk();
+	struct vm_struct **vms;
+
+	if (!chunk)
+		return NULL;
+
+	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+				pcpu_nr_groups, pcpu_atom_size);
+	if (vms) {
+		/* base address is the first group's area minus its offset */
+		chunk->data = vms;
+		chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+		return chunk;
+	}
+
+	pcpu_free_chunk(chunk);
+	return NULL;
+}
+
+/* Release the vm areas of @chunk, if any, then the chunk itself. */
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+	if (chunk) {
+		if (chunk->data)
+			pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+	}
+
+	/* pcpu_free_chunk() copes with a NULL @chunk */
+	pcpu_free_chunk(chunk);
+}
+
+/* Look up the page backing @addr through vmalloc_to_page(). */
+static struct page *pcpu_addr_to_page(void *addr)
+{
+	struct page *page = vmalloc_to_page(addr);
+
+	return page;
+}
+
+/*
+ * Check whether @ai is acceptable for this chunk implementation.
+ * This implementation accepts everything and always returns 0.
+ */
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+ /* no extra restriction */
+ return 0;
+}
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 00000000000..2ddf9a990db
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,1968 @@
+/*
+ * mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009 SUSE Linux Products GmbH
+ * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas. Percpu areas are allocated in chunks. Each chunk
+ * consists of a boot-time determined number of units and the first
+ * chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated.
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space. Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to
+ * cpus. On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
+ *
+ * There are usually many small percpu allocations many of them being
+ * as small as 4 bytes. The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be equal to or larger than the maximum contiguous
+ * area in the chunk. This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map. Each entry holds the start offset of a region with
+ * bit 0 set if the region is in use; a region ends where the next
+ * entry starts. Allocation inside a chunk is done by scanning this
+ * map sequentially and serving the first matching entry. This is
+ * mostly copied from the percpu_modalloc() allocator.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
+ *
+ * To use this allocator, arch code should do the following.
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ * regular address to percpu pointer and back if they need to be
+ * different from the default
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ * setup the first chunk containing the kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/kmemleak.h>
+
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+/* subtracted from fls(size) when sizes are mapped to slots */
+#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
+/* initial, and minimum grown, number of entries in a chunk's area map */
+#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+
+#ifdef CONFIG_SMP
+/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
+#ifndef __addr_to_pcpu_ptr
+/* rebase @addr from pcpu_base_addr onto __per_cpu_start */
+#define __addr_to_pcpu_ptr(addr) \
+ (void __percpu *)((unsigned long)(addr) - \
+ (unsigned long)pcpu_base_addr + \
+ (unsigned long)__per_cpu_start)
+#endif
+#ifndef __pcpu_ptr_to_addr
+/* inverse of the above: rebase @ptr from __per_cpu_start onto pcpu_base_addr */
+#define __pcpu_ptr_to_addr(ptr) \
+ (void __force *)((unsigned long)(ptr) + \
+ (unsigned long)pcpu_base_addr - \
+ (unsigned long)__per_cpu_start)
+#endif
+#else /* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
+#endif /* CONFIG_SMP */
+
+/*
+ * Bookkeeping for one percpu chunk.
+ *
+ * @map holds region start offsets with bit 0 flagging an in-use
+ * region; entry @map_used is a sentry which terminates the last
+ * region.  @populated is a flexible array, so the struct is allocated
+ * with pcpu_chunk_struct_size bytes rather than sizeof().
+ */
+struct pcpu_chunk {
+ struct list_head list; /* linked to pcpu_slot lists */
+ int free_size; /* free bytes in the chunk */
+ int contig_hint; /* max contiguous size hint */
+ void *base_addr; /* base address of this chunk */
+ int map_used; /* # of map entries used before the sentry */
+ int map_alloc; /* # of map entries allocated */
+ int *map; /* allocation map */
+ void *data; /* chunk data */
+ int first_free; /* no free below this */
+ bool immutable; /* no [de]population allowed */
+ unsigned long populated[]; /* populated bitmap */
+};
+
+/*
+ * Allocator geometry.
+ * NOTE(review): none of these are assigned in the code visible here;
+ * presumably they are set up once during first chunk setup - confirm.
+ */
+/* pages per unit; scales the unit index in pcpu_page_idx() */
+static int pcpu_unit_pages __read_mostly;
+/* bytes per unit; also the free size of a freshly allocated chunk */
+static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
+/* passed on to pcpu_get_vm_areas() when creating a chunk */
+static int pcpu_atom_size __read_mostly;
+/* number of lists in pcpu_slot[]; the last one holds fully free chunks */
+static int pcpu_nr_slots __read_mostly;
+/* allocation size used for struct pcpu_chunk */
+static size_t pcpu_chunk_struct_size __read_mostly;
+
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __read_mostly;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
+
+/* group information, used for vm allocation */
+static int pcpu_nr_groups __read_mostly;
+static const unsigned long *pcpu_group_offsets __read_mostly;
+static const size_t *pcpu_group_sizes __read_mostly;
+
+/*
+ * The first chunk which always exists. Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk. This chunk reserves part of the first
+ * chunk and serves it for reserved allocations. The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit. When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+static int pcpu_reserved_chunk_limit;
+
+/*
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks, populated bitmap and
+ * vmalloc mapping. The latter is a spinlock and protects the index
+ * data structures - chunk slots, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released. In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
+ */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
+
+/*
+ * Chunks are kept on pcpu_slot[pcpu_chunk_slot(chunk)]; the last
+ * slot, pcpu_slot[pcpu_nr_slots - 1], holds the fully free ones.
+ */
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+
+/* reclaim work to release fully free chunks, scheduled from free path */
+static void pcpu_reclaim(struct work_struct *work);
+static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+
+/* Does @addr fall into the first unit_size bytes of the first chunk? */
+static bool pcpu_addr_in_first_chunk(void *addr)
+{
+	void *lo = pcpu_first_chunk->base_addr;
+	void *hi = lo + pcpu_unit_size;
+
+	if (addr < lo)
+		return false;
+	return addr < hi;
+}
+
+/* Does @addr fall below the reserved limit of the first chunk? */
+static bool pcpu_addr_in_reserved_chunk(void *addr)
+{
+	void *lo = pcpu_first_chunk->base_addr;
+	void *hi = lo + pcpu_reserved_chunk_limit;
+
+	if (addr < lo)
+		return false;
+	return addr < hi;
+}
+
+/* Map @size (in bytes) to a slot index based on its highest set bit. */
+static int __pcpu_size_to_slot(int size)
+{
+	int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;
+
+	/* slot 0 is never handed out here, clamp to 1 */
+	return slot < 1 ? 1 : slot;
+}
+
+/* Like __pcpu_size_to_slot() but a full unit always maps to the last slot. */
+static int pcpu_size_to_slot(int size)
+{
+	return size == pcpu_unit_size ? pcpu_nr_slots - 1
+				      : __pcpu_size_to_slot(size);
+}
+
+/*
+ * Slot @chunk belongs on: slot 0 if it can't even hold an int,
+ * otherwise the slot matching its free size.
+ */
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+	bool usable = chunk->free_size >= sizeof(int) &&
+		      chunk->contig_hint >= sizeof(int);
+
+	return usable ? pcpu_size_to_slot(chunk->free_size) : 0;
+}
+
+/* record the owning chunk of @page in its index field */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+	unsigned long cookie = (unsigned long)pcpu;
+
+	page->index = cookie;
+}
+
+/* read back the chunk pointer stored in the index field of @page */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+	unsigned long cookie = page->index;
+
+	return (struct pcpu_chunk *)cookie;
+}
+
+/* index of page @page_idx of @cpu's unit in a chunk-wide pages array */
+static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+	int unit_first_page = pcpu_unit_map[cpu] * pcpu_unit_pages;
+
+	return unit_first_page + page_idx;
+}
+
+/* address of page @page_idx inside @cpu's unit of @chunk */
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+				     unsigned int cpu, int page_idx)
+{
+	unsigned long unit_base = (unsigned long)chunk->base_addr +
+				  pcpu_unit_offsets[cpu];
+
+	return unit_base + (page_idx << PAGE_SHIFT);
+}
+
+/*
+ * Find the next unpopulated region at or after *@rs, bounded by @end.
+ * On return [*@rs, *@re) is that region.
+ */
+static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
+					   int *rs, int *re, int end)
+{
+	int first_clear = find_next_zero_bit(chunk->populated, end, *rs);
+
+	*rs = first_clear;
+	*re = find_next_bit(chunk->populated, end, first_clear + 1);
+}
+
+/*
+ * Find the next populated region at or after *@rs, bounded by @end.
+ * On return [*@rs, *@re) is that region.
+ */
+static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
+					 int *rs, int *re, int end)
+{
+	int first_set = find_next_bit(chunk->populated, end, *rs);
+
+	*rs = first_set;
+	*re = find_next_zero_bit(chunk->populated, end, first_set + 1);
+}
+
+/*
+ * (Un)populated page region iterators. Iterate over (un)populated
+ * page regions between @start and @end in @chunk. @rs and @re should
+ * be integer variables and will be set to start and end page index of
+ * the current region.
+ *
+ * The loop body sees the half-open region [@rs, @re). The next search
+ * starts at @re + 1 because the bit at @re is already known to be of
+ * the other kind. Both macros read @chunk->populated on every step.
+ */
+#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+
+/**
+ * pcpu_mem_zalloc - allocate memory
+ * @size: bytes to allocate
+ *
+ * Allocate @size bytes.  Requests of up to PAGE_SIZE are served by
+ * kzalloc(); anything larger goes to vzalloc().  The returned memory
+ * is always zeroed.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure or if
+ * slab isn't available yet.
+ */
+static void *pcpu_mem_zalloc(size_t size)
+{
+	if (WARN_ON_ONCE(!slab_is_available()))
+		return NULL;
+
+	if (size > PAGE_SIZE)
+		return vzalloc(size);
+
+	return kzalloc(size, GFP_KERNEL);
+}
+
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ * @size: size of the area
+ *
+ * Free @ptr, which should have been allocated using pcpu_mem_zalloc()
+ * with the same @size so that the matching free routine is picked.
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
+	if (size > PAGE_SIZE) {
+		vfree(ptr);
+		return;
+	}
+
+	kfree(ptr);
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * Called after an allocation or free changed @chunk.  The slot
+ * matching the new state is computed and @chunk is moved there if it
+ * differs from @oslot.  The reserved chunk is never put on chunk
+ * slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+	int nslot = pcpu_chunk_slot(chunk);
+
+	if (chunk == pcpu_reserved_chunk || oslot == nslot)
+		return;
+
+	/* head of the list when moving up a slot, tail when moving down */
+	if (oslot < nslot)
+		list_move(&chunk->list, &pcpu_slot[nslot]);
+	else
+		list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+}
+
+/**
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
+ *
+ * Check whether the area map of @chunk has room for three more
+ * entries beyond the ones in use, and if not, work out how large it
+ * should become.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
+ */
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
+{
+	int wanted = chunk->map_used + 3;
+	int new_alloc;
+
+	if (chunk->map_alloc >= wanted)
+		return 0;
+
+	/* smallest doubling of the default length which fits @wanted */
+	for (new_alloc = PCPU_DFL_MAP_ALLOC; new_alloc < wanted; new_alloc *= 2)
+		;
+
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.  If the map
+ * is already at least that long by the time pcpu_lock is taken, the
+ * freshly allocated map is dropped and the call still succeeds.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	size_t fresh_size = new_alloc * sizeof(chunk->map[0]);
+	size_t discard_size;
+	int *fresh, *discard;
+	unsigned long flags;
+
+	fresh = pcpu_mem_zalloc(fresh_size);
+	if (!fresh)
+		return -ENOMEM;
+
+	/* acquire pcpu_lock and switch to new area map */
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	if (new_alloc > chunk->map_alloc) {
+		/* carry the current entries over and retire the old map */
+		discard = chunk->map;
+		discard_size = chunk->map_alloc * sizeof(chunk->map[0]);
+		memcpy(fresh, discard, discard_size);
+
+		chunk->map_alloc = new_alloc;
+		chunk->map = fresh;
+	} else {
+		/* map is already large enough, throw the new one away */
+		discard = fresh;
+		discard_size = fresh_size;
+	}
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 */
+	pcpu_mem_free(discard, discard_size);
+
+	return 0;
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size in bytes
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset. It doesn't
+ * populate or map the area.
+ *
+ * Map entries are region start offsets with bit 0 set when the region
+ * is in use; a region's size is the distance to the next entry.
+ * Besides the map, this updates @chunk->first_free, ->contig_hint and
+ * ->free_size and relocates @chunk to its new slot.
+ *
+ * @chunk->map must have at least two free slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+ int oslot = pcpu_chunk_slot(chunk);
+ int max_contig = 0;
+ int i, off;
+ /* set once a free region which stays free has been passed */
+ bool seen_free = false;
+ int *p;
+
+ for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
+ int head, tail;
+ int this_size;
+
+ off = *p;
+ /* bit 0 set: region in use, skip */
+ if (off & 1)
+ continue;
+
+ /* extra for alignment requirement */
+ head = ALIGN(off, align) - off;
+
+ /* region runs up to the next entry's offset (flag bit masked) */
+ this_size = (p[1] & ~1) - off;
+ if (this_size < head + size) {
+ if (!seen_free) {
+ chunk->first_free = i;
+ seen_free = true;
+ }
+ max_contig = max(this_size, max_contig);
+ continue;
+ }
+
+ /*
+ * If head is small or the previous block is free,
+ * merge'em. Note that 'small' is defined as smaller
+ * than sizeof(int), which is very small but isn't too
+ * uncommon for percpu allocations.
+ */
+ if (head && (head < sizeof(int) || !(p[-1] & 1))) {
+ *p = off += head;
+ /* head given to an in-use neighbour is no longer free */
+ if (p[-1] & 1)
+ chunk->free_size -= head;
+ else
+ max_contig = max(*p - p[-1], max_contig);
+ this_size -= head;
+ head = 0;
+ }
+
+ /* if tail is small, just keep it around */
+ tail = this_size - head - size;
+ if (tail < sizeof(int)) {
+ tail = 0;
+ size = this_size - head;
+ }
+
+ /* split if warranted */
+ if (head || tail) {
+ int nr_extra = !!head + !!tail;
+
+ /* insert new subblocks */
+ memmove(p + nr_extra + 1, p + 1,
+ sizeof(chunk->map[0]) * (chunk->map_used - i));
+ chunk->map_used += nr_extra;
+
+ if (head) {
+ if (!seen_free) {
+ chunk->first_free = i;
+ seen_free = true;
+ }
+ /* step to the new entry which starts the allocated part */
+ *++p = off += head;
+ ++i;
+ max_contig = max(head, max_contig);
+ }
+ if (tail) {
+ p[1] = off + size;
+ max_contig = max(tail, max_contig);
+ }
+ }
+
+ /* nothing free before the allocated entry, start after it next time */
+ if (!seen_free)
+ chunk->first_free = i + 1;
+
+ /* update hint and mark allocated */
+ if (i + 1 == chunk->map_used)
+ chunk->contig_hint = max_contig; /* fully scanned */
+ else
+ chunk->contig_hint = max(chunk->contig_hint,
+ max_contig);
+
+ chunk->free_size -= size;
+ *p |= 1;
+
+ pcpu_chunk_relocate(chunk, oslot);
+ return off;
+ }
+
+ chunk->contig_hint = max_contig; /* fully scanned */
+ pcpu_chunk_relocate(chunk, oslot);
+
+ /* tell the upper layer that this chunk has no matching area */
+ return -1;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk. Note that this function
+ * only modifies the allocation map. It doesn't depopulate or unmap
+ * the area.
+ *
+ * The entry is located by binary search and BUG()s if @freeme is not
+ * the start of an in-use area. Free neighbours are merged, and
+ * @chunk is relocated to the slot matching its new state.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+ int oslot = pcpu_chunk_slot(chunk);
+ int off = 0;
+ unsigned i, j;
+ int to_free = 0;
+ int *p;
+
+ freeme |= 1; /* we are searching for <given offset, in use> pair */
+
+ /* binary search over map[0..map_used) for the entry equal to freeme */
+ i = 0;
+ j = chunk->map_used;
+ while (i != j) {
+ unsigned k = (i + j) / 2;
+ off = chunk->map[k];
+ if (off < freeme)
+ i = k + 1;
+ else if (off > freeme)
+ j = k;
+ else
+ i = j = k;
+ }
+ BUG_ON(off != freeme);
+
+ if (i < chunk->first_free)
+ chunk->first_free = i;
+
+ /* clear the in-use bit and account the region as free */
+ p = chunk->map + i;
+ *p = off &= ~1;
+ chunk->free_size += (p[1] & ~1) - off;
+
+ /* merge with next? */
+ if (!(p[1] & 1))
+ to_free++;
+ /* merge with previous? */
+ if (i > 0 && !(p[-1] & 1)) {
+ to_free++;
+ i--;
+ p--;
+ }
+ /* drop the entries which got merged into the region at index i */
+ if (to_free) {
+ chunk->map_used -= to_free;
+ memmove(p + 1, p + 1 + to_free,
+ (chunk->map_used - i) * sizeof(chunk->map[0]));
+ }
+
+ /*
+ * After merging, the entry following the free region is not free,
+ * so it carries bit 0; the "- 1" removes that bit from the size.
+ */
+ chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
+ pcpu_chunk_relocate(chunk, oslot);
+}
+
+/*
+ * Allocate a chunk descriptor and its initial area map, describing
+ * one completely free unit.  Returns NULL if either allocation fails.
+ */
+static struct pcpu_chunk *pcpu_alloc_chunk(void)
+{
+	struct pcpu_chunk *chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+	int *map;
+
+	if (!chunk)
+		return NULL;
+
+	map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	if (!map) {
+		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+		return NULL;
+	}
+
+	/* a single free region at offset 0, closed by the in-use sentry */
+	map[0] = 0;
+	map[1] = pcpu_unit_size | 1;
+
+	chunk->map = map;
+	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+	chunk->map_used = 1;
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->free_size = pcpu_unit_size;
+	chunk->contig_hint = pcpu_unit_size;
+
+	return chunk;
+}
+
+/* Free the area map and descriptor of @chunk; NULL is a no-op. */
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
+{
+	if (chunk) {
+		size_t map_bytes = chunk->map_alloc * sizeof(chunk->map[0]);
+
+		pcpu_mem_free(chunk->map, map_bytes);
+		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+	}
+}
+
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together. The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk - populate the specified range of a chunk
+ * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_create_chunk - create a new chunk
+ * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page - translate address to the struct page backing it
+ * pcpu_verify_alloc_info - check alloc_info is acceptable during init
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static struct pcpu_chunk *pcpu_create_chunk(void);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif
+
+/**
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+	if (!pcpu_addr_in_first_chunk(addr)) {
+		/*
+		 * The address is relative to unit0 which might be unused
+		 * and thus unmapped.  Offset the address to the unit space
+		 * of the current processor before looking it up in the
+		 * vmalloc space.  Note that any possible cpu id can be
+		 * used here, so there's no need to worry about preemption
+		 * or cpu hotplug.
+		 */
+		addr += pcpu_unit_offsets[raw_smp_processor_id()];
+		return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
+	}
+
+	/* first chunk: its reserved part is served by the reserved chunk */
+	return pcpu_addr_in_reserved_chunk(addr) ? pcpu_reserved_chunk
+						 : pcpu_first_chunk;
+}
+
+/**
+ * pcpu_alloc - the percpu allocator
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE, must be a power of two)
+ * @reserved: allocate from the reserved chunk if available
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  @align
+ * smaller than 2 is raised to 2 and an odd @size is rounded up to the
+ * next even value.  A zero @size, a @size above PCPU_MIN_UNIT_SIZE,
+ * an @align above PAGE_SIZE or an @align which isn't a power of two
+ * triggers a warning and fails the allocation.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
+{
+	static int warn_limit = 10;
+	struct pcpu_chunk *chunk;
+	const char *err;
+	int slot, off, new_alloc;
+	unsigned long flags;
+	void __percpu *ptr;
+
+	/*
+	 * We want the lowest bit of offset available for in-use/free
+	 * indicator, so force >= 16bit alignment and make size even.
+	 */
+	if (unlikely(align < 2))
+		align = 2;
+
+	if (unlikely(size & 1))
+		size++;
+
+	/*
+	 * pcpu_alloc_area() rounds offsets with ALIGN(), which only
+	 * works for power of two alignments, so reject anything else
+	 * here instead of handing out misaligned areas.
+	 */
+	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE ||
+		     align > PAGE_SIZE || !is_power_of_2(align))) {
+		WARN(true, "illegal size (%zu) or align (%zu) for "
+		     "percpu allocation\n", size, align);
+		return NULL;
+	}
+
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	/* serve reserved allocations from the reserved chunk if available */
+	if (reserved && pcpu_reserved_chunk) {
+		chunk = pcpu_reserved_chunk;
+
+		if (size > chunk->contig_hint) {
+			err = "alloc from reserved chunk failed";
+			goto fail_unlock;
+		}
+
+		/* extending drops pcpu_lock, so recheck until there's room */
+		while ((new_alloc = pcpu_need_to_extend(chunk))) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+				err = "failed to extend area map of reserved chunk";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+		}
+
+		off = pcpu_alloc_area(chunk, size, align);
+		if (off >= 0)
+			goto area_found;
+
+		err = "alloc from reserved chunk failed";
+		goto fail_unlock;
+	}
+
+restart:
+	/* search through normal chunks */
+	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (size > chunk->contig_hint)
+				continue;
+
+			new_alloc = pcpu_need_to_extend(chunk);
+			if (new_alloc) {
+				spin_unlock_irqrestore(&pcpu_lock, flags);
+				if (pcpu_extend_area_map(chunk,
+							 new_alloc) < 0) {
+					err = "failed to extend area map";
+					goto fail_unlock_mutex;
+				}
+				spin_lock_irqsave(&pcpu_lock, flags);
+				/*
+				 * pcpu_lock has been dropped, need to
+				 * restart pcpu_slot list walking.
+				 */
+				goto restart;
+			}
+
+			off = pcpu_alloc_area(chunk, size, align);
+			if (off >= 0)
+				goto area_found;
+		}
+	}
+
+	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	chunk = pcpu_create_chunk();
+	if (!chunk) {
+		err = "failed to allocate new chunk";
+		goto fail_unlock_mutex;
+	}
+
+	/* link the new chunk into its slot and search again */
+	spin_lock_irqsave(&pcpu_lock, flags);
+	pcpu_chunk_relocate(chunk, -1);
+	goto restart;
+
+area_found:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/* populate, map and clear the area */
+	if (pcpu_populate_chunk(chunk, off, size)) {
+		/* give the offset back before reporting the failure */
+		spin_lock_irqsave(&pcpu_lock, flags);
+		pcpu_free_area(chunk, off);
+		err = "failed to populate";
+		goto fail_unlock;
+	}
+
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	/* return address relative to base address */
+	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+	kmemleak_alloc_percpu(ptr, size);
+	return ptr;
+
+fail_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	/* only the first warn_limit failures are reported */
+	if (warn_limit) {
+		pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
+			   "%s\n", size, align, err);
+		dump_stack();
+		if (!--warn_limit)
+			pr_info("PERCPU: limit reached, disable warning\n");
+	}
+	return NULL;
+}
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from the normal (non-reserved) chunks.  Might sleep.  Might trigger
+ * writeouts.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_percpu(size_t size, size_t align)
+{
+	void __percpu *area = pcpu_alloc(size, align, false);
+
+	return area;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area.  Might sleep.
+ * Might trigger writeouts.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
+{
+	void __percpu *area = pcpu_alloc(size, align, true);
+
+	return area;
+}
+
+/**
+ * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * @work: unused
+ *
+ * Reclaim all fully free chunks except for the first one on the
+ * fully free slot.
+ *
+ * CONTEXT:
+ * workqueue context.
+ */
+static void pcpu_reclaim(struct work_struct *work)
+{
+	struct list_head *free_slot = &pcpu_slot[pcpu_nr_slots - 1];
+	struct pcpu_chunk *chunk, *next;
+	LIST_HEAD(doomed);
+
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
+
+	/* unlink everything but the head of the fully free slot */
+	list_for_each_entry_safe(chunk, next, free_slot, list) {
+		WARN_ON(chunk->immutable);
+
+		if (chunk != list_first_entry(free_slot, struct pcpu_chunk,
+					      list))
+			list_move(&chunk->list, &doomed);
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+
+	/* the chunks are out of circulation, tear them down without pcpu_lock */
+	list_for_each_entry_safe(chunk, next, &doomed, list) {
+		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
+		pcpu_destroy_chunk(chunk);
+	}
+
+	mutex_unlock(&pcpu_alloc_mutex);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.  A NULL @ptr is ignored.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+void free_percpu(void __percpu *ptr)
+{
+	struct pcpu_chunk *chunk;
+	unsigned long flags;
+	void *addr;
+
+	if (!ptr)
+		return;
+
+	kmemleak_free_percpu(ptr);
+
+	addr = __pcpu_ptr_to_addr(ptr);
+
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	chunk = pcpu_chunk_addr_search(addr);
+	pcpu_free_area(chunk, addr - chunk->base_addr);
+
+	/* if there are more than one fully free chunks, wake up grim reaper */
+	if (chunk->free_size == pcpu_unit_size) {
+		struct list_head *free_slot = &pcpu_slot[pcpu_nr_slots - 1];
+		struct pcpu_chunk *other;
+
+		list_for_each_entry(other, free_slot, list) {
+			if (other == chunk)
+				continue;
+			schedule_work(&pcpu_reclaim_work);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * is_kernel_percpu_address - check for an in-kernel static percpu address
+ * @addr: address to test
+ *
+ * Decide whether @addr lies inside the static percpu area of the kernel
+ * image for any possible cpu.  Static percpu areas of modules are not
+ * covered; use is_module_percpu_address() for those.
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+#ifdef CONFIG_SMP
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+	void *target = (void *)addr;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		/* [lo, hi) is this cpu's copy of the static area */
+		void *lo = per_cpu_ptr(base, cpu);
+		void *hi = lo + static_size;
+
+		if (target < lo || target >= hi)
+			continue;
+		return true;
+	}
+#endif
+	/* on UP, can't distinguish from other static vars, always false */
+	return false;
+}
+
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * @addr must be a dereferenceable address obtained through one of the
+ * percpu access macros.  It is translated into the physical address
+ * backing it.  Keeping @addr valid until this function returns is the
+ * caller's job.
+ *
+ * The first chunk is set up specially by the percpu allocator: it is
+ * either embedded in the linear address space or mapped through
+ * vmalloc.  From the second chunk on, the backing allocator (currently
+ * either vm or km) provides the translation.
+ *
+ * The address could be translated without checking whether it falls
+ * into the first chunk.  The check is kept because it mirrors how the
+ * percpu allocator actually works, and the verification can expose bugs
+ * both in the allocator itself and in per_cpu_ptr_to_phys() callers.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+	unsigned long target = (unsigned long)addr;
+	unsigned long first_low, first_high;
+	bool in_first_chunk = false;
+	unsigned int cpu;
+
+	/*
+	 * The range test on first_low/first_high isn't strictly
+	 * necessary but speeds up lookups of addresses which aren't
+	 * in the first chunk.
+	 */
+	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+				     pcpu_unit_pages);
+	if (target >= first_low && target < first_high) {
+		for_each_possible_cpu(cpu) {
+			void *start = per_cpu_ptr(base, cpu);
+
+			if (addr >= start && addr < start + pcpu_unit_size) {
+				in_first_chunk = true;
+				break;
+			}
+		}
+	}
+
+	/* outside the first chunk: go through the backing allocator */
+	if (!in_first_chunk)
+		return page_to_phys(pcpu_addr_to_page(addr)) +
+		       offset_in_page(addr);
+
+	/* first chunk mapped via vmalloc */
+	if (is_vmalloc_addr(addr))
+		return page_to_phys(vmalloc_to_page(addr)) +
+		       offset_in_page(addr);
+
+	/* first chunk living in the linear mapping */
+	return __pa(addr);
+}
+
+/**
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate a pcpu_alloc_info with room for @nr_groups groups and a
+ * single cpu_map array of @nr_units entries placed behind them.
+ * groups[0].cpu_map is pointed at that array and every entry is set to
+ * NR_CPUS.  The cpu_map pointers of the remaining groups are left for
+ * the caller to initialize.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+						      int nr_units)
+{
+	struct pcpu_alloc_info *ai;
+	size_t map_off, total, alloc_size;
+	int i;
+
+	/* the cpu_map array starts right after groups[], suitably aligned */
+	map_off = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+			__alignof__(ai->groups[0].cpu_map[0]));
+	total = map_off + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+	alloc_size = PFN_ALIGN(total);
+
+	ai = memblock_virt_alloc_nopanic(alloc_size, 0);
+	if (!ai)
+		return NULL;
+
+	ai->groups[0].cpu_map = (void *)ai + map_off;
+	for (i = 0; i < nr_units; i++)
+		ai->groups[0].cpu_map[i] = NR_CPUS;
+
+	ai->nr_groups = nr_groups;
+	/* remembered so that the whole allocation can be freed later */
+	ai->__ai_size = alloc_size;
+
+	return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Release @ai, which must have come from pcpu_alloc_alloc_info().
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+	phys_addr_t ai_phys = __pa(ai);
+
+	/* __ai_size is the page-aligned size recorded at allocation time */
+	memblock_free_early(ai_phys, ai->__ai_size);
+}
+
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print the sizes stored in @ai followed by the group/unit to cpu
+ * layout, using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+				 const struct pcpu_alloc_info *ai)
+{
+	int group_width = 1, cpu_width = 1, width;
+	char empty_str[] = "--------";
+	int alloc = 0, alloc_end = 0;
+	int group, v;
+	int upa, apl;	/* units per alloc, allocs per line */
+
+	/* number of decimal digits in the group count */
+	for (v = ai->nr_groups; (v /= 10) != 0; )
+		group_width++;
+
+	/* number of decimal digits in the possible cpu count */
+	for (v = num_possible_cpus(); (v /= 10) != 0; )
+		cpu_width++;
+
+	/* cut the placeholder for unused units down to cpu_width dashes */
+	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
+
+	upa = ai->alloc_size / ai->unit_size;
+	width = upa * (cpu_width + 1) + group_width + 3;
+	apl = rounddown_pow_of_two(max(60 / width, 1));
+
+	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
+
+	for (group = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+		int unit = 0, unit_end = 0;
+
+		BUG_ON(gi->nr_units % upa);
+		alloc_end += gi->nr_units / upa;
+
+		while (alloc < alloc_end) {
+			/* begin a new output line every apl allocs */
+			if (alloc % apl == 0) {
+				printk(KERN_CONT "\n");
+				printk("%spcpu-alloc: ", lvl);
+			}
+			printk(KERN_CONT "[%0*d] ", group_width, group);
+
+			unit_end += upa;
+			for (; unit < unit_end; unit++) {
+				if (gi->cpu_map[unit] == NR_CPUS)
+					printk(KERN_CONT "%s ", empty_str);
+				else
+					printk(KERN_CONT "%0*d ", cpu_width,
+					       gi->cpu_map[unit]);
+			}
+			alloc++;
+		}
+	}
+	printk(KERN_CONT "\n");
+}
+
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @ai: pcpu_alloc_info describing how the percpu area is shaped
+ * @base_addr: mapped address
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * percpu area. This function is to be called from arch percpu area
+ * setup path.
+ *
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk. This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation. This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk. The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
+ *
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
+ *
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
+ *
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size. This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas. Units which should be colocated are put into the
+ * same group. Dynamic VM areas will be allocated according to these
+ * groupings. If @ai->nr_groups is zero, a single group containing
+ * all units is assumed.
+ *
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
+ *
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area. They share the same vm
+ * and page map but uses different area allocation map to stay away
+ * from each other. The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				  void *base_addr)
+{
+	static char cpus_buf[4096] __initdata;
+	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	size_t dyn_size = ai->dyn_size;
+	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
+	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned long *group_offsets;
+	size_t *group_sizes;
+	unsigned long *unit_off;
+	unsigned int cpu;
+	int *unit_map;
+	int group, unit, i;
+
+	/* formatted up front so that PCPU_SETUP_BUG_ON() can print it */
+	cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
+
+#define PCPU_SETUP_BUG_ON(cond)	do {					\
+	if (unlikely(cond)) {						\
+		pr_emerg("PERCPU: failed to initialize, %s", #cond);	\
+		pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf);	\
+		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
+		BUG();							\
+	}								\
+} while (0)
+
+	/* sanity checks */
+	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
+	PCPU_SETUP_BUG_ON(!ai->static_size);
+	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+#endif
+	PCPU_SETUP_BUG_ON(!base_addr);
+	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
+
+	/* process group information and build config tables accordingly */
+	group_offsets = memblock_virt_alloc(ai->nr_groups *
+					    sizeof(group_offsets[0]), 0);
+	group_sizes = memblock_virt_alloc(ai->nr_groups *
+					  sizeof(group_sizes[0]), 0);
+	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
+	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
+
+	/* UINT_MAX marks a cpu which has no unit assigned yet */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		unit_map[cpu] = UINT_MAX;
+
+	pcpu_low_unit_cpu = NR_CPUS;
+	pcpu_high_unit_cpu = NR_CPUS;
+
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		group_offsets[group] = gi->base_offset;
+		group_sizes[group] = gi->nr_units * ai->unit_size;
+
+		for (i = 0; i < gi->nr_units; i++) {
+			cpu = gi->cpu_map[i];
+			if (cpu == NR_CPUS)
+				continue;
+
+			/*
+			 * unit_map[] and unit_off[] hold nr_cpu_ids
+			 * entries, so cpu == nr_cpu_ids is already out
+			 * of bounds and must be rejected as well.
+			 */
+			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
+			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
+
+			unit_map[cpu] = unit + i;
+			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
+			/* determine low/high unit_cpu */
+			if (pcpu_low_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+				pcpu_low_unit_cpu = cpu;
+			if (pcpu_high_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+				pcpu_high_unit_cpu = cpu;
+		}
+	}
+	pcpu_nr_units = unit;
+
+	/* every possible cpu must have been given a unit */
+	for_each_possible_cpu(cpu)
+		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+	/* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+	pcpu_dump_alloc_info(KERN_DEBUG, ai);
+
+	pcpu_nr_groups = ai->nr_groups;
+	pcpu_group_offsets = group_offsets;
+	pcpu_group_sizes = group_sizes;
+	pcpu_unit_map = unit_map;
+	pcpu_unit_offsets = unit_off;
+
+	/* determine basic parameters */
+	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
+	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+	pcpu_atom_size = ai->atom_size;
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+
+	/*
+	 * Allocate chunk slots.  The additional last slot is for
+	 * empty chunks.
+	 */
+	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+	pcpu_slot = memblock_virt_alloc(
+			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
+	for (i = 0; i < pcpu_nr_slots; i++)
+		INIT_LIST_HEAD(&pcpu_slot[i]);
+
+	/*
+	 * Initialize static chunk.  If reserved_size is zero, the
+	 * static chunk covers static area + dynamic allocation area
+	 * in the first chunk.  If reserved_size is not zero, it
+	 * covers static area + reserved area (mostly used for module
+	 * static percpu allocation).
+	 */
+	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+	INIT_LIST_HEAD(&schunk->list);
+	schunk->base_addr = base_addr;
+	schunk->map = smap;
+	schunk->map_alloc = ARRAY_SIZE(smap);
+	schunk->immutable = true;
+	bitmap_fill(schunk->populated, pcpu_unit_pages);
+
+	if (ai->reserved_size) {
+		schunk->free_size = ai->reserved_size;
+		pcpu_reserved_chunk = schunk;
+		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
+	} else {
+		schunk->free_size = dyn_size;
+		dyn_size = 0;			/* dynamic area covered */
+	}
+	schunk->contig_hint = schunk->free_size;
+
+	/* seed the area map: static area first, then the free area if any */
+	schunk->map[0] = 1;
+	schunk->map[1] = ai->static_size;
+	schunk->map_used = 1;
+	if (schunk->free_size)
+		schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
+	else
+		schunk->map[1] |= 1;
+
+	/* init dynamic chunk if necessary */
+	if (dyn_size) {
+		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+		INIT_LIST_HEAD(&dchunk->list);
+		dchunk->base_addr = base_addr;
+		dchunk->map = dmap;
+		dchunk->map_alloc = ARRAY_SIZE(dmap);
+		dchunk->immutable = true;
+		bitmap_fill(dchunk->populated, pcpu_unit_pages);
+
+		dchunk->contig_hint = dchunk->free_size = dyn_size;
+		dchunk->map[0] = 1;
+		dchunk->map[1] = pcpu_reserved_chunk_limit;
+		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
+		dchunk->map_used = 2;
+	}
+
+	/* link the first chunk in */
+	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_chunk_relocate(pcpu_first_chunk, -1);
+
+	/* we're done */
+	pcpu_base_addr = base_addr;
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { /* name string for each PCPU_FC_* value */
+ [PCPU_FC_AUTO] = "auto",
+ [PCPU_FC_EMBED] = "embed",
+ [PCPU_FC_PAGE] = "page",
+};
+
+enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; /* starts as auto; the "percpu_alloc" early param may change it */
+
+/*
+ * Handler for the "percpu_alloc" early parameter.  Records the
+ * requested first chunk allocator in pcpu_chosen_fc.  Only allocators
+ * built into this configuration are recognized; any other value is
+ * warned about and otherwise ignored.  Returns -EINVAL when no value
+ * was given and 0 in all other cases.
+ */
+static int __init percpu_alloc_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
+	if (!strcmp(str, "embed")) {
+		pcpu_chosen_fc = PCPU_FC_EMBED;
+		return 0;
+	}
+#endif
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+	if (!strcmp(str, "page")) {
+		pcpu_chosen_fc = PCPU_FC_PAGE;
+		return 0;
+	}
+#endif
+	pr_warning("PERCPU: unknown allocator %s specified\n", str);
+
+	return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always multiples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, size_t dyn_size,
+				size_t atom_size,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	int nr_groups = 1, nr_units = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs, group, unit;
+	unsigned int cpu, tcpu;
+	struct pcpu_alloc_info *ai;
+	unsigned int *cpu_map;
+
+	/* this function may be called multiple times */
+	memset(group_map, 0, sizeof(group_map));
+	memset(group_cnt, 0, sizeof(group_cnt));
+
+	/* calculate size_sum and ensure dyn_size is enough for early alloc */
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+	dyn_size = size_sum - static_size - reserved_size;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is multiple of atom_size and is the smallest
+	 * which can accommodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, atom_size);
+	upa = alloc_size / min_unit_size;
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/*
+	 * Group cpus according to their proximity.  A cpu is put into
+	 * the lowest numbered group which holds no earlier cpu that is
+	 * further away than LOCAL_DISTANCE in either direction, so the
+	 * groups in use are always 0 .. nr_groups - 1 without holes.
+	 */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group && cpu_distance_fn &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				group++;
+				nr_groups = max(nr_groups, group + 1);
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group < nr_groups; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 1/3.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	upa = best_upa;
+
+	/* allocate and fill alloc_info */
+	for (group = 0; group < nr_groups; group++)
+		nr_units += roundup(group_cnt[group], upa);
+
+	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+	if (!ai)
+		return ERR_PTR(-ENOMEM);
+	cpu_map = ai->groups[0].cpu_map;
+
+	/* carve the single cpu_map array up between the groups */
+	for (group = 0; group < nr_groups; group++) {
+		ai->groups[group].cpu_map = cpu_map;
+		cpu_map += roundup(group_cnt[group], upa);
+	}
+
+	ai->static_size = static_size;
+	ai->reserved_size = reserved_size;
+	ai->dyn_size = dyn_size;
+	ai->unit_size = alloc_size / upa;
+	ai->atom_size = atom_size;
+	ai->alloc_size = alloc_size;
+
+	/*
+	 * Walk exactly the nr_groups groups in use.  Terminating on a
+	 * zero group_cnt[] entry instead would read group_cnt[NR_CPUS],
+	 * one past the end of the array, when every cpu ended up in a
+	 * group of its own.
+	 */
+	for (group = 0, unit = 0; group < nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+
+		/*
+		 * Initialize base_offset as if all groups are located
+		 * back-to-back.  The caller should update this to
+		 * reflect actual allocation.
+		 */
+		gi->base_offset = unit * ai->unit_size;
+
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				gi->cpu_map[gi->nr_units++] = cpu;
+		gi->nr_units = roundup(gi->nr_units, upa);
+		unit += gi->nr_units;
+	}
+	BUG_ON(unit != nr_units);
+
+	return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
+/**
+ * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: function to free percpu page
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * If this function is used to setup the first chunk, it is allocated
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area. Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size. Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space. Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
+ *
+ * @dyn_size specifies the minimum dynamic area size.
+ *
+ * If the needed size is smaller than the minimum or specified unit
+ * size, the leftover is returned using @free_fn.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
+				  size_t atom_size,
+				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+				  pcpu_fc_alloc_fn_t alloc_fn,
+				  pcpu_fc_free_fn_t free_fn)
+{
+	void *base = (void *)ULONG_MAX;
+	void **areas = NULL;
+	struct pcpu_alloc_info *ai;
+	size_t size_sum, areas_size, max_distance;
+	int group, i, rc;
+
+	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+				   cpu_distance_fn);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+
+	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
+
+	areas = memblock_virt_alloc_nopanic(areas_size, 0);
+	if (!areas) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* allocate the group areas and determine base address */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		unsigned int cpu = NR_CPUS;
+		void *ptr;
+
+		/* pick the first cpu actually mapped in this group */
+		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+			cpu = gi->cpu_map[i];
+		BUG_ON(cpu == NR_CPUS);
+
+		/* allocate space for the whole group */
+		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+		if (!ptr) {
+			rc = -ENOMEM;
+			goto out_free_areas;
+		}
+		/* kmemleak tracks the percpu allocations separately */
+		kmemleak_free(ptr);
+		areas[group] = ptr;
+
+		base = min(ptr, base);
+	}
+
+	/* base address is now known, determine group base offsets */
+	max_distance = 0;
+	for (group = 0; group < ai->nr_groups; group++) {
+		ai->groups[group].base_offset = areas[group] - base;
+		max_distance = max_t(size_t, max_distance,
+				     ai->groups[group].base_offset);
+	}
+	max_distance += ai->unit_size;
+
+	/*
+	 * Warn if maximum distance is further than 75% of vmalloc space.
+	 * This is checked while every group area is still whole, so the
+	 * failure path can hand the areas back through out_free_areas
+	 * instead of leaking them.
+	 */
+	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
+			   "space 0x%lx\n", max_distance,
+			   VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+		/* and fail if we have fallback */
+		rc = -EINVAL;
+		goto out_free_areas;
+#endif
+	}
+
+	/*
+	 * Copy data and free unused parts.  This should happen after all
+	 * allocations are complete; otherwise, we may end up with
+	 * overlapping groups.
+	 */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		void *ptr = areas[group];
+
+		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+			if (gi->cpu_map[i] == NR_CPUS) {
+				/* unused unit, free whole */
+				free_fn(ptr, ai->unit_size);
+				continue;
+			}
+			/* copy and return the unused part */
+			memcpy(ptr, __per_cpu_load, ai->static_size);
+			free_fn(ptr + size_sum, ai->unit_size - size_sum);
+		}
+	}
+
+	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+		ai->dyn_size, ai->unit_size);
+
+	rc = pcpu_setup_first_chunk(ai, base);
+	goto out_free;
+
+out_free_areas:
+	/* only reached while no part of any group area has been freed */
+	for (group = 0; group < ai->nr_groups; group++)
+		if (areas[group])
+			free_fn(areas[group],
+				ai->groups[group].nr_units * ai->unit_size);
+out_free:
+	pcpu_free_alloc_info(ai);
+	if (areas)
+		memblock_free_early(__pa(areas), areas_size);
+	return rc;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK */
+
+#ifdef BUILD_PAGE_FIRST_CHUNK
+/**
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator. Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_page_first_chunk(size_t reserved_size,
+				 pcpu_fc_alloc_fn_t alloc_fn,
+				 pcpu_fc_free_fn_t free_fn,
+				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+	static struct vm_struct vm;
+	struct pcpu_alloc_info *ai;
+	char psize_str[16];
+	int unit_pages;
+	size_t pages_size;
+	struct page **pages;
+	int unit, i, nr_allocated, rc;
+
+	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
+	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+
+	unit_pages = ai->unit_size >> PAGE_SHIFT;
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
+	pages = memblock_virt_alloc(pages_size, 0);
+
+	/* allocate the backing pages, unit by unit */
+	nr_allocated = 0;
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		unsigned int cpu = ai->groups[0].cpu_map[unit];
+
+		for (i = 0; i < unit_pages; i++) {
+			void *ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+
+			if (!ptr) {
+				pr_warning("PERCPU: failed to allocate %s page "
+					   "for cpu%u\n", psize_str, cpu);
+				goto enomem;
+			}
+			/* kmemleak tracks the percpu allocations separately */
+			kmemleak_free(ptr);
+			pages[nr_allocated++] = virt_to_page(ptr);
+		}
+	}
+
+	/* allocate vm area, map the pages and copy static data */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * ai->unit_size;
+	vm_area_register_early(&vm, PAGE_SIZE);
+
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		unsigned long unit_addr =
+			(unsigned long)vm.addr + unit * ai->unit_size;
+
+		for (i = 0; i < unit_pages; i++)
+			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+		/* pte already populated, the following shouldn't fail */
+		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+				      unit_pages);
+		if (rc < 0)
+			panic("failed to map percpu area, err=%d\n", rc);
+
+		/*
+		 * FIXME: Archs with virtual cache should flush local
+		 * cache for the linear mapping here - something
+		 * equivalent to flush_cache_vmap() on the local cpu.
+		 * flush_cache_vmap() can't be used as most supporting
+		 * data structures are not set up yet.
+		 */
+
+		/* copy static data */
+		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+		unit_pages, psize_str, vm.addr, ai->static_size,
+		ai->reserved_size, ai->dyn_size);
+
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
+	goto out_free_ar;
+
+enomem:
+	/* give back the pages obtained so far, most recent first */
+	for (i = nr_allocated - 1; i >= 0; i--)
+		free_fn(page_address(pages[i]), PAGE_SIZE);
+	rc = -ENOMEM;
+out_free_ar:
+	memblock_free_early(__pa(pages), pages_size);
+	pcpu_free_alloc_info(ai);
+	return rc;
+}
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+/*
+ * Generic SMP percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup. This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location. As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; /* one offset per cpu, written by setup_per_cpu_areas() */
+EXPORT_SYMBOL(__per_cpu_offset);
+
+/*
+ * Default first chunk allocation callback: memblock allocation of
+ * @size bytes aligned to @align, starting the search at
+ * __pa(MAX_DMA_ADDRESS).  @cpu is not used.  Returns NULL on failure.
+ */
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+				       size_t align)
+{
+	const phys_addr_t min_addr = __pa(MAX_DMA_ADDRESS);
+
+	return memblock_virt_alloc_from_nopanic(size, align, min_addr);
+}
+
+/* Default first chunk free callback: return @size bytes at @ptr to memblock. */
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+	phys_addr_t phys = __pa(ptr);
+
+	memblock_free_early(phys, size);
+}
+
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long base_delta;
+	unsigned int cpu;
+
+	/*
+	 * Always reserve area for module percpu variables.  That's
+	 * what the legacy allocator did.
+	 */
+	if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+				   pcpu_dfl_fc_alloc, pcpu_dfl_fc_free) < 0)
+		panic("Failed to initialize percpu areas.");
+
+	/* offset of a cpu = distance to the first chunk + its unit offset */
+	base_delta = (unsigned long)pcpu_base_addr -
+		     (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
+}
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else /* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * On UP the km-based percpu allocator with identity mapping is always
+ * used.  Static percpu variables cannot be told apart from ordinary
+ * static variables and need no special preparation, so only a single
+ * unit for dynamic allocations is set up here.
+ */
+void __init setup_per_cpu_areas(void)
+{
+	const size_t unit_size =
+		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+					 PERCPU_DYNAMIC_RESERVE));
+	struct pcpu_alloc_info *ai;
+	void *first_chunk;
+
+	/* one group holding one unit */
+	ai = pcpu_alloc_alloc_info(1, 1);
+	first_chunk = memblock_virt_alloc_from_nopanic(unit_size,
+						       PAGE_SIZE,
+						       __pa(MAX_DMA_ADDRESS));
+	if (!ai)
+		panic("Failed to allocate memory for percpu areas.");
+	if (!first_chunk)
+		panic("Failed to allocate memory for percpu areas.");
+	/* kmemleak tracks the percpu allocations separately */
+	kmemleak_free(first_chunk);
+
+	/* the whole unit is handed to the dynamic allocator */
+	ai->dyn_size = unit_size;
+	ai->unit_size = unit_size;
+	ai->atom_size = unit_size;
+	ai->alloc_size = unit_size;
+	ai->groups[0].nr_units = 1;
+	ai->groups[0].cpu_map[0] = 0;
+
+	if (pcpu_setup_first_chunk(ai, first_chunk) < 0)
+		panic("Failed to initialize percpu areas.");
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * The first and reserved chunks start out with temporary allocation
+ * maps kept in initdata, which lets them be used before slab is
+ * online.  Once slab is up this function is called to copy those maps
+ * into properly allocated memory and switch the chunks over.
+ */
+void __init percpu_init_late(void)
+{
+	struct pcpu_chunk *targets[] =
+		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+	struct pcpu_chunk **pos;
+	unsigned long flags;
+
+	/* the walk ends at the first NULL entry */
+	for (pos = targets; *pos; pos++) {
+		struct pcpu_chunk *chunk = *pos;
+		int *new_map;
+		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(new_map[0]);
+
+		BUILD_BUG_ON(size > PAGE_SIZE);
+
+		new_map = pcpu_mem_zalloc(size);
+		BUG_ON(!new_map);
+
+		/* copy and switch over under pcpu_lock */
+		spin_lock_irqsave(&pcpu_lock, flags);
+		memcpy(new_map, chunk->map, size);
+		chunk->map = new_map;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
+	}
+}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 00000000000..a8b91992593
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,202 @@
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <linux/pagemap.h>
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none. Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+/* Report the bad pgd entry via pgd_ERROR(), then clear it with pgd_clear(). */
+void pgd_clear_bad(pgd_t *pgd)
+{
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+}
+
+/* Report the bad pud entry via pud_ERROR(), then clear it with pud_clear(). */
+void pud_clear_bad(pud_t *pud)
+{
+ pud_ERROR(*pud);
+ pud_clear(pud);
+}
+
+/* Report the bad pmd entry via pmd_ERROR(), then clear it with pmd_clear(). */
+void pmd_clear_bad(pmd_t *pmd)
+{
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+}
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache. This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c so we changed this macro slightly
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pte_t *ptep,
+			  pte_t entry, int dirty)
+{
+	/* nothing to install when the PTE already equals @entry */
+	if (pte_same(*ptep, entry))
+		return 0;
+
+	set_pte_at(vma->vm_mm, address, ptep, entry);
+	flush_tlb_fix_spurious_fault(vma, address);
+	return 1;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	const int differs = !pmd_same(*pmdp, entry);
+
+	/* @address must be aligned to a huge pmd boundary */
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	if (!differs)
+		return 0;
+
+	set_pmd_at(vma->vm_mm, address, pmdp, entry);
+	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	return 1;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+	BUG();
+	return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pte_t *ptep)
+{
+	const int was_young = ptep_test_and_clear_young(vma, address, ptep);
+
+	/* only flush the page's TLB entry when the young state was set */
+	if (was_young)
+		flush_tlb_page(vma, address);
+	return was_young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+/*
+ * Test-and-clear the young state of a huge pmd via
+ * pmdp_test_and_clear_young() and, if it was set, flush the TLB for the
+ * HPAGE_PMD_SIZE range starting at @address. Returns the value reported
+ * by pmdp_test_and_clear_young().
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* @address must be aligned to a huge pmd boundary */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#else
+ /* this generic version must not be reached without THP support */
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+		       pte_t *ptep)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t old = ptep_get_and_clear(mm, address, ptep);
+
+	/* flush only if the cleared entry was accessible */
+	if (pte_accessible(mm, old))
+		flush_tlb_page(vma, address);
+	return old;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Clear the huge pmd at @pmdp, flush the TLB for the HPAGE_PMD_SIZE range
+ * starting at @address, and return the value the pmd held before it was
+ * cleared.
+ */
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+ /* @address must be aligned to a huge pmd boundary */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ /* unlike ptep_clear_flush(), the flush here is unconditional */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Install the pmd_mksplitting() version of the current huge pmd at @pmdp
+ * and flush the TLB for the HPAGE_PMD_SIZE range starting at @address.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = pmd_mksplitting(*pmdp);
+ /* @address must be aligned to a huge pmd boundary */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ /* tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Deposit @pgtable on the list reachable through pmd_huge_pte(mm, pmdp)
+ * and make it the new head. The caller must hold the lock returned by
+ * pmd_lockptr(mm, pmdp); this is asserted below.
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ /* first deposit: @pgtable becomes a single-element list */
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ /* link @pgtable right after the current head */
+ list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
+ pmd_huge_pte(mm, pmdp) = pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pgtable_t head;
+	pgtable_t successor = NULL;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	/* FIFO: hand out the current head, promote its neighbour if any */
+	head = pmd_huge_pte(mm, pmdp);
+	if (!list_empty(&head->lru)) {
+		/* pick the successor before list_del() unlinks the head */
+		successor = list_entry(head->lru.next, struct page, lru);
+		list_del(&head->lru);
+	}
+	pmd_huge_pte(mm, pmdp) = successor;
+	return head;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Mark the huge pmd at @pmdp not-present and flush the TLB for the
+ * HPAGE_PMD_SIZE range starting at @address. If the pmd carries the NUMA
+ * marking (pmd_numa()), that marking is removed first.
+ */
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		     pmd_t *pmdp)
+{
+	pmd_t entry = *pmdp;
+
+	if (pmd_numa(entry))
+		entry = pmd_mknonnuma(entry);
+	/*
+	 * Install the adjusted copy. Re-reading *pmdp here would throw away
+	 * the pmd_mknonnuma() result computed above.
+	 */
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
+	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index b4e76c25f95..00000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * mm/prio_tree.c - priority search tree for mapping->i_mmap
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004 Initial version
- */
-
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-
-/*
- * See lib/prio_tree.c for details on the general radix priority search tree
- * code.
- */
-
-/*
- * The following #defines are mirrored from lib/prio_tree.c. They're only used
- * for debugging, and should be removed (along with the debugging code using
- * them) when switching also VMAs to the regular prio_tree code.
- */
-
-#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
-#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-/*
- * Radix priority search tree for address_space->i_mmap
- *
- * For each vma that map a unique set of file pages i.e., unique [radix_index,
- * heap_index] value, we have a corresponing priority search tree node. If
- * multiple vmas have identical [radix_index, heap_index] value, then one of
- * them is used as a tree node and others are stored in a vm_set list. The tree
- * node points to the first vma (head) of the list using vm_set.head.
- *
- * prio_tree_root
- * |
- * A vm_set.head
- * / \ /
- * L R -> H-I-J-K-M-N-O-P-Q-S
- * ^ ^ <-- vm_set.list -->
- * tree nodes
- *
- * We need some way to identify whether a vma is a tree node, head of a vm_set
- * list, or just a member of a vm_set list. We cannot use vm_flags to store
- * such information. The reason is, in the above figure, it is possible that
- * vm_flags' of R and H are covered by the different mmap_sems. When R is
- * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
- * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
- * That's why some trick involving shared.vm_set.parent is used for identifying
- * tree nodes and list head nodes.
- *
- * vma radix priority search tree node rules:
- *
- * vma->shared.vm_set.parent != NULL ==> a tree node
- * vma->shared.vm_set.head != NULL ==> list of others mapping same range
- * vma->shared.vm_set.head == NULL ==> no others map the same range
- *
- * vma->shared.vm_set.parent == NULL
- * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
- * vma->shared.vm_set.head == NULL ==> a list node
- */
-
-/*
- * Add a new vma known to map the same set of pages as the old vma:
- * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
- * Note that it just happens to work correctly on i_mmap_nonlinear too.
- */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
-{
- /* Leave these BUG_ONs till prio_tree patch stabilizes */
- BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
- BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
-
- vma->shared.vm_set.head = NULL;
- vma->shared.vm_set.parent = NULL;
-
- if (!old->shared.vm_set.parent)
- list_add(&vma->shared.vm_set.list,
- &old->shared.vm_set.list);
- else if (old->shared.vm_set.head)
- list_add_tail(&vma->shared.vm_set.list,
- &old->shared.vm_set.head->shared.vm_set.list);
- else {
- INIT_LIST_HEAD(&vma->shared.vm_set.list);
- vma->shared.vm_set.head = old;
- old->shared.vm_set.head = vma;
- }
-}
-
-void vma_prio_tree_insert(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *old;
-
- vma->shared.vm_set.head = NULL;
-
- ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
- if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
- old = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- vma_prio_tree_add(vma, old);
- }
-}
-
-void vma_prio_tree_remove(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct vm_area_struct *node, *head, *new_head;
-
- if (!vma->shared.vm_set.head) {
- if (!vma->shared.vm_set.parent)
- list_del_init(&vma->shared.vm_set.list);
- else
- raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
- } else {
- /* Leave this BUG_ON till prio_tree patch stabilizes */
- BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
- if (vma->shared.vm_set.parent) {
- head = vma->shared.vm_set.head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- } else
- new_head = NULL;
-
- raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->shared.vm_set.head = new_head;
- if (new_head)
- new_head->shared.vm_set.head = head;
-
- } else {
- node = vma->shared.vm_set.head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->shared.vm_set.head = new_head;
- new_head->shared.vm_set.head = node;
- } else
- node->shared.vm_set.head = NULL;
- }
- }
-}
-
-/*
- * Helper function to enumerate vmas that map a given file page or a set of
- * contiguous file pages. The function returns vmas that at least map a single
- * page in the given range of contiguous file pages.
- */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
- struct prio_tree_iter *iter)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *next;
-
- if (!vma) {
- /*
- * First call is with NULL vma
- */
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
- }
-
- if (vma->shared.vm_set.parent) {
- if (vma->shared.vm_set.head) {
- next = vma->shared.vm_set.head;
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- } else {
- next = list_entry(vma->shared.vm_set.list.next,
- struct vm_area_struct, shared.vm_set.list);
- if (!next->shared.vm_set.head) {
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- }
-
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
-}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 00000000000..5077afcd9e1
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,379 @@
+/*
+ * linux/mm/process_vm_access.c
+ *
+ * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+
+/**
+ * process_vm_rw_pages - read/write pages from task specified
+ * @pages: array of pointers to pages we want to copy
+ * @offset: offset in the first page to start copying from/to
+ * @len: number of bytes to copy
+ * @iter: where to copy to/from locally
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success, error code otherwise
+ */
+static int process_vm_rw_pages(struct page **pages,
+ unsigned offset,
+ size_t len,
+ struct iov_iter *iter,
+ int vm_write)
+{
+ /* Do the copy for each page */
+ while (len && iov_iter_count(iter)) {
+ struct page *page = *pages++;
+ /* at most the rest of this page, capped by the bytes left */
+ size_t copy = PAGE_SIZE - offset;
+ size_t copied;
+
+ if (copy > len)
+ copy = len;
+
+ if (vm_write) {
+ copied = copy_page_from_iter(page, offset, copy, iter);
+ set_page_dirty_lock(page);
+ } else {
+ copied = copy_page_to_iter(page, offset, copy, iter);
+ }
+ len -= copied;
+ /* a short copy while the iterator still has room is a fault */
+ if (copied < copy && iov_iter_count(iter))
+ return -EFAULT;
+ /* only the first page starts at a non-zero offset */
+ offset = 0;
+ }
+ return 0;
+}
+
+/* Maximum number of bytes kmalloc'd to hold struct page pointers during copy */
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+
+/**
+ * process_vm_rw_single_vec - read/write pages from task specified
+ * @addr: start memory address of target process
+ * @len: size of area to copy to/from
+ * @iter: where to copy to/from locally
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @mm: mm for task
+ * @task: task to read/write from
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success or on failure error code
+ */
+static int process_vm_rw_single_vec(unsigned long addr,
+				    unsigned long len,
+				    struct iov_iter *iter,
+				    struct page **process_pages,
+				    struct mm_struct *mm,
+				    struct task_struct *task,
+				    int vm_write)
+{
+	unsigned long pa = addr & PAGE_MASK;
+	unsigned long start_offset = addr - pa;
+	unsigned long nr_pages;
+	ssize_t rc = 0;
+	/*
+	 * process_pages is an array of struct page pointers, so the
+	 * per-iteration limit is the buffer size divided by that pointer
+	 * size.
+	 */
+	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
+		/ sizeof(struct page *);
+
+	/* Work out address and page range required */
+	if (len == 0)
+		return 0;
+	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+	while (!rc && nr_pages && iov_iter_count(iter)) {
+		int pages = min(nr_pages, max_pages_per_loop);
+		size_t bytes;
+
+		/* Get the pages we're interested in */
+		down_read(&mm->mmap_sem);
+		pages = get_user_pages(task, mm, pa, pages,
+				       vm_write, 0, process_pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		if (pages <= 0)
+			return -EFAULT;
+
+		/* bytes covered by the pages obtained, capped by what is left */
+		bytes = pages * PAGE_SIZE - start_offset;
+		if (bytes > len)
+			bytes = len;
+
+		rc = process_vm_rw_pages(process_pages,
+					 start_offset, bytes, iter,
+					 vm_write);
+		len -= bytes;
+		/* only the first batch starts inside a page */
+		start_offset = 0;
+		nr_pages -= pages;
+		pa += pages * PAGE_SIZE;
+		/* drop the references taken by get_user_pages() */
+		while (pages)
+			put_page(process_pages[--pages]);
+	}
+
+	return rc;
+}
+
+/* Maximum number of entries for process pages array
+ which lives on stack */
+#define PVM_MAX_PP_ARRAY_COUNT 16
+
+/**
+ * process_vm_rw_core - core of reading/writing pages from task specified
+ * @pid: PID of process to read/write from/to
+ * @iter: where to copy to/from locally
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
+				  const struct iovec *rvec,
+				  unsigned long riovcnt,
+				  unsigned long flags, int vm_write)
+{
+	struct task_struct *task;
+	struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
+	struct page **process_pages = pp_stack;
+	struct mm_struct *mm;
+	unsigned long i;
+	ssize_t rc = 0;
+	unsigned long nr_pages = 0;
+	unsigned long nr_pages_iov;
+	ssize_t iov_len;
+	size_t total_len = iov_iter_count(iter);
+
+	/*
+	 * Work out how many struct page pointers we're going to need
+	 * when eventually calling get_user_pages: the largest page count
+	 * spanned by any single remote iovec.
+	 */
+	for (i = 0; i < riovcnt; i++) {
+		iov_len = rvec[i].iov_len;
+		if (iov_len > 0) {
+			nr_pages_iov = ((unsigned long)rvec[i].iov_base
+					+ iov_len)
+				/ PAGE_SIZE - (unsigned long)rvec[i].iov_base
+				/ PAGE_SIZE + 1;
+			nr_pages = max(nr_pages, nr_pages_iov);
+		}
+	}
+
+	if (nr_pages == 0)
+		return 0;
+
+	if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
+		/*
+		 * The on-stack array is too small. For reliability don't
+		 * try to kmalloc more than PVM_MAX_KMALLOC_PAGES bytes.
+		 */
+		process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
+					      sizeof(struct page *) * nr_pages),
+					GFP_KERNEL);
+
+		if (!process_pages)
+			return -ENOMEM;
+	}
+
+	/* Get process information */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task) {
+		rc = -ESRCH;
+		goto free_proc_pages;
+	}
+
+	mm = mm_access(task, PTRACE_MODE_ATTACH);
+	if (!mm || IS_ERR(mm)) {
+		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		/*
+		 * Explicitly map EACCES to EPERM as EPERM is a more
+		 * appropriate error code for process_vm_readv/writev
+		 */
+		if (rc == -EACCES)
+			rc = -EPERM;
+		goto put_task_struct;
+	}
+
+	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
+		rc = process_vm_rw_single_vec(
+			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
+			iter, process_pages, mm, task, vm_write);
+
+	/* copied = space before - space after */
+	total_len -= iov_iter_count(iter);
+
+	/*
+	 * If we have managed to copy any data at all then we return the
+	 * number of bytes copied. Otherwise we return the error code.
+	 */
+	if (total_len)
+		rc = total_len;
+
+	mmput(mm);
+
+put_task_struct:
+	put_task_struct(task);
+
+free_proc_pages:
+	/* only free the array when it was kmalloc'd above */
+	if (process_pages != pp_stack)
+		kfree(process_pages);
+	return rc;
+}
+
+/**
+ * process_vm_rw - check iovecs before calling core routine
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw(pid_t pid,
+			     const struct iovec __user *lvec,
+			     unsigned long liovcnt,
+			     const struct iovec __user *rvec,
+			     unsigned long riovcnt,
+			     unsigned long flags, int vm_write)
+{
+	/* direction of the transfer as seen from the local iovec */
+	const int dir = vm_write ? WRITE : READ;
+	struct iovec local_fast[UIO_FASTIOV];
+	struct iovec remote_fast[UIO_FASTIOV];
+	struct iovec *local_iov = local_fast;
+	struct iovec *remote_iov = remote_fast;
+	struct iov_iter iter;
+	ssize_t rc;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	/* Check the local iovec; a result <= 0 is returned as is */
+	rc = rw_copy_check_uvector(dir, lvec, liovcnt, UIO_FASTIOV,
+				   local_fast, &local_iov);
+	if (rc > 0) {
+		iov_iter_init(&iter, dir, local_iov, liovcnt, rc);
+
+		/* Check the remote iovec, then do the actual transfer */
+		rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
+					   UIO_FASTIOV, remote_fast,
+					   &remote_iov);
+		if (rc > 0)
+			rc = process_vm_rw_core(pid, &iter, remote_iov,
+						riovcnt, flags, vm_write);
+	}
+
+	/* free whichever arrays were replaced by heap allocations */
+	if (remote_iov != remote_fast)
+		kfree(remote_iov);
+	if (local_iov != local_fast)
+		kfree(local_iov);
+
+	return rc;
+}
+
+/* process_vm_readv: calls process_vm_rw() with vm_write == 0 (read from the other process) */
+SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
+}
+
+/* process_vm_writev: calls process_vm_rw() with vm_write == 1 (write to the other process) */
+SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
+ const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
+}
+
+#ifdef CONFIG_COMPAT
+
+/*
+ * Compat counterpart of process_vm_rw(): checks both compat iovec arrays
+ * with compat_rw_copy_check_uvector() and then calls process_vm_rw_core().
+ * Returns -EINVAL when @flags is non-zero; otherwise returns the result of
+ * the first check that is <= 0, or the result of process_vm_rw_core().
+ */
+static ssize_t
+compat_process_vm_rw(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
+ /* initial value is always overwritten by the first check below */
+ ssize_t rc = -EFAULT;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* check the local iovec in the direction of the transfer */
+ if (vm_write)
+ rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l);
+ else
+ rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l);
+ if (rc <= 0)
+ goto free_iovecs;
+ iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
+ /* check the remote iovec */
+ rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
+ UIO_FASTIOV, iovstack_r,
+ &iov_r);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+
+free_iovecs:
+ /* free only the arrays that no longer point at the on-stack buffers */
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ if (iov_l != iovstack_l)
+ kfree(iov_l);
+ return rc;
+}
+
+/* compat process_vm_readv: calls compat_process_vm_rw() with vm_write == 0 */
+COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
+ const struct compat_iovec __user *, lvec,
+ compat_ulong_t, liovcnt,
+ const struct compat_iovec __user *, rvec,
+ compat_ulong_t, riovcnt,
+ compat_ulong_t, flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 0);
+}
+
+/* compat process_vm_writev: calls compat_process_vm_rw() with vm_write == 1 */
+COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
+ const struct compat_iovec __user *, lvec,
+ compat_ulong_t, liovcnt,
+ const struct compat_iovec __user *, rvec,
+ compat_ulong_t, riovcnt,
+ compat_ulong_t, flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 1);
+}
+
+#endif
diff --git a/mm/quicklist.c b/mm/quicklist.c
new file mode 100644
index 00000000000..94221297052
--- /dev/null
+++ b/mm/quicklist.c
@@ -0,0 +1,102 @@
+/*
+ * Quicklist support.
+ *
+ * Quicklists are light weight lists of pages that have a defined state
+ * on alloc and free. Pages must be in the quicklist specific defined state
+ * (zero by default) when the page is freed. It seems that the initial idea
+ * for such lists first came from Dave Miller and then various other people
+ * improved on it.
+ *
+ * Copyright (C) 2007 SGI,
+ * Christoph Lameter <clameter@sgi.com>
+ * Generalized, added support for multiple lists and
+ * constructors / destructors.
+ */
+#include <linux/kernel.h>
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/quicklist.h>
+
+DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
+
+#define FRACTION_OF_NODE_MEM 16
+
+static unsigned long max_pages(unsigned long min_pages)
+{
+	int node = numa_node_id();
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	unsigned long free_pages;
+	unsigned long limit;
+	int cpus_on_node;
+
+	/* free pages of the node's DMA, DMA32 (when configured) and NORMAL zones */
+	free_pages = zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
+#ifdef CONFIG_ZONE_DMA
+	free_pages += zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES);
+#endif
+#ifdef CONFIG_ZONE_DMA32
+	free_pages += zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES);
+#endif
+
+	/* a fixed fraction of that, split evenly between the node's cpus */
+	limit = free_pages / FRACTION_OF_NODE_MEM;
+	cpus_on_node = cpumask_weight(cpumask_of_node(node));
+	limit /= cpus_on_node;
+
+	/* never report less than the caller's minimum */
+	return max(limit, min_pages);
+}
+
+static long min_pages_to_free(struct quicklist *q,
+			      unsigned long min_pages, long max_free)
+{
+	/* pages held beyond what max_pages() allows for this list */
+	long excess = q->nr_pages - max_pages(min_pages);
+
+	/* never ask for more than @max_free at once */
+	return excess < max_free ? excess : max_free;
+}
+
+/*
+ * Trim down the number of pages in the quicklist
+ */
+void quicklist_trim(int nr, void (*dtor)(void *),
+ unsigned long min_pages, unsigned long max_free)
+{
+ long pages_to_free;
+ struct quicklist *q;
+
+ /* quicklist number @nr of this cpu; paired with put_cpu_var() below */
+ q = &get_cpu_var(quicklist)[nr];
+ /* nothing to do unless the list holds more than @min_pages */
+ if (q->nr_pages > min_pages) {
+ pages_to_free = min_pages_to_free(q, min_pages, max_free);
+
+ while (pages_to_free > 0) {
+ /*
+ * We pass a gfp_t of 0 to quicklist_alloc here
+ * because we will never call into the page allocator.
+ */
+ void *p = quicklist_alloc(nr, 0, NULL);
+
+ /* run the optional destructor before freeing the page */
+ if (dtor)
+ dtor(p);
+ free_page((unsigned long)p);
+ pages_to_free--;
+ }
+ }
+ put_cpu_var(quicklist);
+}
+
+unsigned long quicklist_total_size(void)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	/* sum nr_pages over every quicklist of every online cpu */
+	for_each_online_cpu(cpu) {
+		struct quicklist *lists = per_cpu(quicklist, cpu);
+		int i;
+
+		for (i = 0; i < CONFIG_NR_QUICK; i++)
+			total += lists[i].nr_pages;
+	}
+	return total;
+}
+
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656..0ca36a7770b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -3,30 +3,22 @@
*
* Copyright (C) 2002, Linus Torvalds
*
- * 09Apr2002 akpm@zip.com.au
+ * 09Apr2002 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/file.h>
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
-struct backing_dev_info default_backing_dev_info = {
- .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
- .state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
-};
-EXPORT_SYMBOL_GPL(default_backing_dev_info);
+#include "internal.h"
/*
* Initialise a struct file's readahead state. Assumes that the caller has
@@ -36,87 +28,48 @@ void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = mapping->backing_dev_info->ra_pages;
- ra->prev_page = -1;
-}
-
-/*
- * Return max readahead size for this inode in number-of-pages.
- */
-static inline unsigned long get_max_readahead(struct file_ra_state *ra)
-{
- return ra->ra_pages;
-}
-
-static inline unsigned long get_min_readahead(struct file_ra_state *ra)
-{
- return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ ra->prev_pos = -1;
}
+EXPORT_SYMBOL_GPL(file_ra_state_init);
-static inline void reset_ahead_window(struct file_ra_state *ra)
-{
- /*
- * ... but preserve ahead_start + ahead_size value,
- * see 'recheck:' label in page_cache_readahead().
- * Note: We never use ->ahead_size as rvalue without
- * checking ->ahead_start != 0 first.
- */
- ra->ahead_size += ra->ahead_start;
- ra->ahead_start = 0;
-}
-
-static inline void ra_off(struct file_ra_state *ra)
-{
- ra->start = 0;
- ra->flags = 0;
- ra->size = 0;
- reset_ahead_window(ra);
- return;
-}
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
/*
- * Set the initial window size, round to next power of 2 and square
- * for small size, x 4 for medium, and x 2 for large
- * for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
+ * see if a page needs releasing upon read_cache_pages() failure
+ * - the caller of read_cache_pages() may have set PG_private or PG_fscache
+ * before calling, such as the NFS fs marking pages that are cached locally
+ * on disk, thus we need to give the fs a chance to clean up in the event of
+ * an error
*/
-static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+static void read_cache_pages_invalidate_page(struct address_space *mapping,
+ struct page *page)
{
- unsigned long newsize = roundup_pow_of_two(size);
-
- if (newsize <= max / 32)
- newsize = newsize * 4;
- else if (newsize <= max / 4)
- newsize = newsize * 2;
- else
- newsize = max;
- return newsize;
+ if (page_has_private(page)) {
+ if (!trylock_page(page))
+ BUG();
+ page->mapping = mapping;
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ page->mapping = NULL;
+ unlock_page(page);
+ }
+ page_cache_release(page);
}
/*
- * Set the new window size, this is called only when I/O is to be submitted,
- * not for each call to readahead. If a cache miss occured, reduce next I/O
- * size, else increase depending on how close to max we are.
+ * release a list of pages, invalidating them first if need be
*/
-static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
+static void read_cache_pages_invalidate_pages(struct address_space *mapping,
+ struct list_head *pages)
{
- unsigned long max = get_max_readahead(ra);
- unsigned long min = get_min_readahead(ra);
- unsigned long cur = ra->size;
- unsigned long newsize;
+ struct page *victim;
- if (ra->flags & RA_FLAG_MISS) {
- ra->flags &= ~RA_FLAG_MISS;
- newsize = max((cur - 2), min);
- } else if (cur < max / 16) {
- newsize = 4 * cur;
- } else {
- newsize = 2 * cur;
+ while (!list_empty(pages)) {
+ victim = list_to_page(pages);
+ list_del(&victim->lru);
+ read_cache_pages_invalidate_page(mapping, victim);
}
- return min(newsize, max);
}
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
/**
* read_cache_pages - populate an address space with some pages & start reads against them
* @mapping: the address_space
@@ -131,33 +84,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
{
struct page *page;
- struct pagevec lru_pvec;
int ret = 0;
- pagevec_init(&lru_pvec, 0);
-
while (!list_empty(pages)) {
page = list_to_page(pages);
list_del(&page->lru);
- if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
- page_cache_release(page);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL)) {
+ read_cache_pages_invalidate_page(mapping, page);
continue;
}
+ page_cache_release(page);
+
ret = filler(data, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- if (ret) {
- while (!list_empty(pages)) {
- struct page *victim;
-
- victim = list_to_page(pages);
- list_del(&victim->lru);
- page_cache_release(victim);
- }
+ if (unlikely(ret)) {
+ read_cache_pages_invalidate_pages(mapping, pages);
break;
}
+ task_io_account_read(PAGE_CACHE_SIZE);
}
- pagevec_lru_add(&lru_pvec);
return ret;
}
@@ -166,105 +111,47 @@ EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
+ struct blk_plug plug;
unsigned page_idx;
- struct pagevec lru_pvec;
int ret;
+ blk_start_plug(&plug);
+
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ /* Clean up the remaining pages */
+ put_pages_list(pages);
goto out;
}
- pagevec_init(&lru_pvec, 0);
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
- if (!add_to_page_cache(page, mapping,
+ if (!add_to_page_cache_lru(page, mapping,
page->index, GFP_KERNEL)) {
mapping->a_ops->readpage(filp, page);
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
- } else
- page_cache_release(page);
+ }
+ page_cache_release(page);
}
- pagevec_lru_add(&lru_pvec);
ret = 0;
+
out:
+ blk_finish_plug(&plug);
+
return ret;
}
/*
- * Readahead design.
- *
- * The fields in struct file_ra_state represent the most-recently-executed
- * readahead attempt:
- *
- * start: Page index at which we started the readahead
- * size: Number of pages in that read
- * Together, these form the "current window".
- * Together, start and size represent the `readahead window'.
- * prev_page: The page which the readahead algorithm most-recently inspected.
- * It is mainly used to detect sequential file reading.
- * If page_cache_readahead sees that it is again being called for
- * a page which it just looked at, it can return immediately without
- * making any state changes.
- * ahead_start,
- * ahead_size: Together, these form the "ahead window".
- * ra_pages: The externally controlled max readahead for this fd.
- *
- * When readahead is in the off state (size == 0), readahead is disabled.
- * In this state, prev_page is used to detect the resumption of sequential I/O.
- *
- * The readahead code manages two windows - the "current" and the "ahead"
- * windows. The intent is that while the application is walking the pages
- * in the current window, I/O is underway on the ahead window. When the
- * current window is fully traversed, it is replaced by the ahead window
- * and the ahead window is invalidated. When this copying happens, the
- * new current window's pages are probably still locked. So
- * we submit a new batch of I/O immediately, creating a new ahead window.
- *
- * So:
- *
- * ----|----------------|----------------|-----
- * ^start ^start+size
- * ^ahead_start ^ahead_start+ahead_size
- *
- * ^ When this page is read, we submit I/O for the
- * ahead window.
- *
- * A `readahead hit' occurs when a read request is made against a page which is
- * the next sequential page. Ahead window calculations are done only when it
- * is time to submit a new IO. The code ramps up the size agressively at first,
- * but slow down as it approaches max_readhead.
- *
- * Any seek/ramdom IO will result in readahead being turned off. It will resume
- * at the first sequential access.
- *
- * There is a special-case: if the first page which the application tries to
- * read happens to be the first page of the file, it is assumed that a linear
- * read is about to happen and the window is immediately set to the initial size
- * based on I/O request size and the max_readahead.
- *
- * This function is to be called for every read request, rather than when
- * it is time to perform readahead. It is called only once for the entire I/O
- * regardless of size unless readahead is unable to start enough I/O to satisfy
- * the request (I/O request > max_readahead).
- */
-
-/*
- * do_page_cache_readahead actually reads a chunk of disk. It allocates all
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
* the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
- *
- * do_page_cache_readahead() returns -1 if it encountered request queue
- * congestion.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
@@ -277,32 +164,32 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (isize == 0)
goto out;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
/*
* Preallocate as many pages as we will need.
*/
- read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
-
+
if (page_offset > end_index)
break;
+ rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
- if (page)
+ rcu_read_unlock();
+ if (page && !radix_tree_exceptional_entry(page))
continue;
- read_unlock_irq(&mapping->tree_lock);
- page = page_cache_alloc_cold(mapping);
- read_lock_irq(&mapping->tree_lock);
+ page = page_cache_alloc_readahead(mapping);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
+ if (page_idx == nr_to_read - lookahead_size)
+ SetPageReadahead(page);
ret++;
}
- read_unlock_irq(&mapping->tree_lock);
/*
* Now start the IO. We ignore I/O errors - if the page is not
@@ -323,11 +210,10 @@ out:
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
- int ret = 0;
-
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
+ nr_to_read = max_sane_readahead(nr_to_read);
while (nr_to_read) {
int err;
@@ -336,250 +222,360 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
- offset, this_chunk);
- if (err < 0) {
- ret = err;
- break;
- }
- ret += err;
+ offset, this_chunk, 0);
+ if (err < 0)
+ return err;
+
offset += this_chunk;
nr_to_read -= this_chunk;
}
- return ret;
+ return 0;
}
+#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
/*
- * Check how effective readahead is being. If the amount of started IO is
- * less than expected then the file is partly or fully in pagecache and
- * readahead isn't helping.
- *
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
*/
-static inline int check_ra_success(struct file_ra_state *ra,
- unsigned long nr_to_read, unsigned long actual)
+unsigned long max_sane_readahead(unsigned long nr)
{
- if (actual == 0) {
- ra->cache_hit += nr_to_read;
- if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
- ra_off(ra);
- ra->flags |= RA_FLAG_INCACHE;
- return 0;
- }
- } else {
- ra->cache_hit=0;
- }
- return 1;
+ return min(nr, MAX_READAHEAD);
}
/*
- * This version skips the IO if the queue is read-congested, and will tell the
- * block layer to abandon the readahead if request allocation would block.
- *
- * force_page_cache_readahead() will ignore queue congestion and will block on
- * request queues.
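+ * Set the initial window size: round the request up to the next power of 2,
+ * then multiply by 4 for small sizes (<= max/32), by 2 for medium sizes
+ * (<= max/4), and use max for anything larger. For a 32 page (128k) max ra:
+ * 1 page = 4 pages initial, 2-8 pages = 2x the rounded size, > 8 pages = 32 pages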
*/
-int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
- if (bdi_read_congested(mapping->backing_dev_info))
- return -1;
+ unsigned long newsize = roundup_pow_of_two(size);
- return __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+ if (newsize <= max / 32)
+ newsize = newsize * 4;
+ else if (newsize <= max / 4)
+ newsize = newsize * 2;
+ else
+ newsize = max;
+
+ return newsize;
}
/*
- * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
- * is set wait till the read completes. Otherwise attempt to read without
- * blocking.
- * Returns 1 meaning 'success' if read is successful without switching off
- * readahead mode. Otherwise return failure.
+ * Get the previous window size, ramp it up, and
+ * return it as the new window size.
*/
-static int
-blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read,
- struct file_ra_state *ra, int block)
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
+ unsigned long max)
{
- int actual;
-
- if (!block && bdi_read_congested(mapping->backing_dev_info))
- return 0;
+ unsigned long cur = ra->size;
+ unsigned long newsize;
- actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);
+ if (cur < max / 16)
+ newsize = 4 * cur;
+ else
+ newsize = 2 * cur;
- return check_ra_success(ra, nr_to_read, actual);
+ return min(newsize, max);
}
-static int make_ahead_window(struct address_space *mapping, struct file *filp,
- struct file_ra_state *ra, int force)
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ * |<----- async_size ---------|
+ * |------------------- size -------------------->|
+ * |==================#===========================|
+ * ^start ^page marked with PG_readahead
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_pos tracks the last visited byte in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slows down
+ * as it approaches max_readahead.
+ */
+
+/*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ * - length of the sequential read sequence, or
+ * - thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+ struct file_ra_state *ra,
+ pgoff_t offset, unsigned long max)
{
- int block, ret;
-
- ra->ahead_size = get_next_ra_size(ra);
- ra->ahead_start = ra->start + ra->size;
-
- block = force || (ra->prev_page >= ra->ahead_start);
- ret = blockable_page_cache_readahead(mapping, filp,
- ra->ahead_start, ra->ahead_size, ra, block);
-
- if (!ret && !force) {
- /* A read failure in blocking mode, implies pages are
- * all cached. So we can safely assume we have taken
- * care of all the pages requested in this call.
- * A read failure in non-blocking mode, implies we are
- * reading more pages than requested in this call. So
- * we safely assume we have taken care of all the pages
- * requested in this call.
- *
- * Just reset the ahead window in case we failed due to
- * congestion. The ahead window will any way be closed
- * in case we failed due to excessive page cache hits.
- */
- reset_ahead_window(ra);
- }
+ pgoff_t head;
- return ret;
+ rcu_read_lock();
+ head = page_cache_prev_hole(mapping, offset - 1, max);
+ rcu_read_unlock();
+
+ return offset - 1 - head;
}
-/**
- * page_cache_readahead - generic adaptive readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
- * @req_size: hint: total size of the read which the caller is performing in
- * PAGE_CACHE_SIZE units
- *
- * page_cache_readahead() is the main function. If performs the adaptive
- * readahead window size management and submits the readahead I/O.
- *
- * Note that @filp is purely used for passing on to the ->readpage[s]()
- * handler: it may refer to a different file from @mapping (so we may not use
- * @filp->f_mapping or @filp->f_dentry->d_inode here).
- * Also, @ra may not be equal to &@filp->f_ra.
- *
+/*
+ * page cache context based read-ahead
*/
-unsigned long
-page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
- struct file *filp, pgoff_t offset, unsigned long req_size)
+static int try_context_readahead(struct address_space *mapping,
+ struct file_ra_state *ra,
+ pgoff_t offset,
+ unsigned long req_size,
+ unsigned long max)
{
- unsigned long max, newsize;
- int sequential;
+ pgoff_t size;
+
+ size = count_history_pages(mapping, ra, offset, max);
/*
- * We avoid doing extra work and bogusly perturbing the readahead
- * window expansion logic.
+ * not enough history pages:
+ * it could be a random read
*/
- if (offset == ra->prev_page && --req_size)
- ++offset;
+ if (size <= req_size)
+ return 0;
- /* Note that prev_page == -1 if it is a first read */
- sequential = (offset == ra->prev_page + 1);
- ra->prev_page = offset;
+ /*
+ * starts from beginning of file:
+ * it is a strong indication of long-run stream (or whole-file-read)
+ */
+ if (size >= offset)
+ size *= 2;
- max = get_max_readahead(ra);
- newsize = min(req_size, max);
+ ra->start = offset;
+ ra->size = min(size + req_size, max);
+ ra->async_size = 1;
- /* No readahead or sub-page sized read or file already in cache */
- if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
- goto out;
+ return 1;
+}
- ra->prev_page += newsize - 1;
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t offset,
+ unsigned long req_size)
+{
+ unsigned long max = max_sane_readahead(ra->ra_pages);
+ pgoff_t prev_offset;
/*
- * Special case - first read at start of file. We'll assume it's
- * a whole-file read and grow the window fast. Or detect first
- * sequential access
+ * start of file
*/
- if (sequential && ra->size == 0) {
- ra->size = get_init_ra_size(newsize, max);
- ra->start = offset;
- if (!blockable_page_cache_readahead(mapping, filp, offset,
- ra->size, ra, 1))
- goto out;
-
- /*
- * If the request size is larger than our max readahead, we
- * at least want to be sure that we get 2 IOs in flight and
- * we know that we will definitly need the new I/O.
- * once we do this, subsequent calls should be able to overlap
- * IOs,* thus preventing stalls. so issue the ahead window
- * immediately.
- */
- if (req_size >= max)
- make_ahead_window(mapping, filp, ra, 1);
+ if (!offset)
+ goto initial_readahead;
- goto out;
+ /*
+ * It's the expected callback offset, assume sequential access.
+ * Ramp up sizes, and push forward the readahead window.
+ */
+ if ((offset == (ra->start + ra->size - ra->async_size) ||
+ offset == (ra->start + ra->size))) {
+ ra->start += ra->size;
+ ra->size = get_next_ra_size(ra, max);
+ ra->async_size = ra->size;
+ goto readit;
}
/*
- * Now handle the random case:
- * partial page reads and first access were handled above,
- * so this must be the next page otherwise it is random
+ * Hit a marked page without valid readahead state.
+ * E.g. interleaved reads.
+ * Query the pagecache for async_size, which normally equals the
+ * readahead size. Ramp it up and use it as the new readahead size.
*/
- if (!sequential) {
- ra_off(ra);
- blockable_page_cache_readahead(mapping, filp, offset,
- newsize, ra, 1);
- goto out;
+ if (hit_readahead_marker) {
+ pgoff_t start;
+
+ rcu_read_lock();
+ start = page_cache_next_hole(mapping, offset + 1, max);
+ rcu_read_unlock();
+
+ if (!start || start - offset > max)
+ return 0;
+
+ ra->start = start;
+ ra->size = start - offset; /* old async_size */
+ ra->size += req_size;
+ ra->size = get_next_ra_size(ra, max);
+ ra->async_size = ra->size;
+ goto readit;
}
/*
- * If we get here we are doing sequential IO and this was not the first
- * occurence (ie we have an existing window)
+ * oversize read
*/
- if (ra->ahead_start == 0) { /* no ahead window yet */
- if (!make_ahead_window(mapping, filp, ra, 0))
- goto recheck;
- }
+ if (req_size > max)
+ goto initial_readahead;
+
+ /*
+ * sequential cache miss
+ * trivial case: (offset - prev_offset) == 1
+ * unaligned reads: (offset - prev_offset) == 0
+ */
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ if (offset - prev_offset <= 1UL)
+ goto initial_readahead;
+
+ /*
+ * Query the page cache and look for the traces (cached history pages)
+ * that a sequential stream would leave behind.
+ */
+ if (try_context_readahead(mapping, ra, offset, req_size, max))
+ goto readit;
+
+ /*
+ * standalone, small random read
+ * Read as is, and do not pollute the readahead state.
+ */
+ return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+
+initial_readahead:
+ ra->start = offset;
+ ra->size = get_init_ra_size(req_size, max);
+ ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+readit:
/*
- * Already have an ahead window, check if we crossed into it.
- * If so, shift windows and issue a new ahead window.
- * Only return the #pages that are in the current window, so that
- * we get called back on the first page of the ahead window which
- * will allow us to submit more IO.
+ * Will this read hit the readahead marker made by itself?
+ * If so, trigger the readahead marker hit now, and merge
+ * the resulted next readahead window into the current one.
*/
- if (ra->prev_page >= ra->ahead_start) {
- ra->start = ra->ahead_start;
- ra->size = ra->ahead_size;
- make_ahead_window(mapping, filp, ra, 0);
-recheck:
- /* prev_page shouldn't overrun the ahead window */
- ra->prev_page = min(ra->prev_page,
- ra->ahead_start + ra->ahead_size - 1);
+ if (offset == ra->start && ra->size == ra->async_size) {
+ ra->async_size = get_next_ra_size(ra, max);
+ ra->size += ra->async_size;
}
-out:
- return ra->prev_page + 1;
+ return ra_submit(ra, mapping, filp);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead);
-/*
- * handle_ra_miss() is called when it is known that a page which should have
- * been present in the pagecache (we just did some readahead there) was in fact
- * not found. This will happen if it was evicted by the VM (readahead
- * thrashing)
+/**
+ * page_cache_sync_readahead - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
*
- * Turn on the cache miss flag in the RA struct, this will cause the RA code
- * to reduce the RA size on the next read.
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read. The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
*/
-void handle_ra_miss(struct address_space *mapping,
- struct file_ra_state *ra, pgoff_t offset)
+void page_cache_sync_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ pgoff_t offset, unsigned long req_size)
{
- ra->flags |= RA_FLAG_MISS;
- ra->flags &= ~RA_FLAG_INCACHE;
- ra->cache_hit = 0;
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /* be dumb */
+ if (filp && (filp->f_mode & FMODE_RANDOM)) {
+ force_page_cache_readahead(mapping, filp, offset, req_size);
+ return;
+ }
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
+EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset which has the PG_readahead flag set
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ * pagecache pages
+ *
+ * page_cache_async_readahead() should be called when a page is used which
+ * has the PG_readahead flag; this is a marker to suggest that the application
+ * has used up enough of the readahead window that we should start pulling in
+ * more pages.
*/
-unsigned long max_sane_readahead(unsigned long nr)
+void
+page_cache_async_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ struct page *page, pgoff_t offset,
+ unsigned long req_size)
+{
+ /* no read-ahead */
+ if (!ra->ra_pages)
+ return;
+
+ /*
+ * Same bit is used for PG_readahead and PG_reclaim.
+ */
+ if (PageWriteback(page))
+ return;
+
+ ClearPageReadahead(page);
+
+ /*
+ * Defer asynchronous read-ahead on IO congestion.
+ */
+ if (bdi_read_congested(mapping->backing_dev_info))
+ return;
+
+ /* do read-ahead */
+ ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+ pgoff_t index, unsigned long nr)
{
- unsigned long active;
- unsigned long inactive;
- unsigned long free;
+ if (!mapping || !mapping->a_ops)
+ return -EINVAL;
- __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
- return min(nr, (inactive + free) / 2);
+ return force_page_cache_readahead(mapping, filp, index, nr);
+}
+
+SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
+{
+ ssize_t ret;
+ struct fd f;
+
+ ret = -EBADF;
+ f = fdget(fd);
+ if (f.file) {
+ if (f.file->f_mode & FMODE_READ) {
+ struct address_space *mapping = f.file->f_mapping;
+ pgoff_t start = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long len = end - start + 1;
+ ret = do_readahead(mapping, f.file, start, len);
+ }
+ fdput(f);
+ }
+ return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index e2155d791d9..22a4a7699cd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -14,34 +14,32 @@
* Original design by Rik van Riel <riel@conectiva.com.br> 2001
* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
- * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
+ * Contributions by Hugh Dickins 2003, 2004
*/
/*
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * inode->i_alloc_sem
- *
- * When a page fault occurs in writing from user to file, down_read
- * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
- * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
- * taken together; in truncation, i_mutex is taken outermost.
- *
- * mm->mmap_sem
- * page->flags PG_locked (lock_page)
- * mapping->i_mmap_lock
- * anon_vma->lock
- * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * inode_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within inode_lock in __sync_single_inode)
+ * mm->mmap_sem
+ * page->flags PG_locked (lock_page)
+ * mapping->i_mmap_mutex
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * mapping->tree_lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
+ *
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
+ * pte map lock
*/
#include <linux/mm.h>
@@ -50,188 +48,505 @@
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
-struct kmem_cache *anon_vma_cachep;
+#include "internal.h"
+
+static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
+static inline struct anon_vma *anon_vma_alloc(void)
{
-#ifdef CONFIG_DEBUG_VM
- struct anon_vma *anon_vma = find_vma->anon_vma;
- struct vm_area_struct *vma;
- unsigned int mapcount = 0;
- int found = 0;
+ struct anon_vma *anon_vma;
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- mapcount++;
- BUG_ON(mapcount > 100000);
- if (vma == find_vma)
- found = 1;
+ anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ if (anon_vma) {
+ atomic_set(&anon_vma->refcount, 1);
+ /*
+ * Initialise the anon_vma root to point to itself. If called
+ * from fork, the root will be reset to the parent's anon_vma.
+ */
+ anon_vma->root = anon_vma;
}
- BUG_ON(!found);
-#endif
+
+ return anon_vma;
}
-/* This must be called under the mmap_sem. */
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+ VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+ /*
+ * Synchronize against page_lock_anon_vma_read() such that
+ * we can safely hold the lock without the anon_vma getting
+ * freed.
+ *
+ * Relies on the full mb implied by the atomic_dec_and_test() from
+ * put_anon_vma() against the acquire barrier implied by
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+ *
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
+ * LOCK MB
+ * atomic_read() rwsem_is_locked()
+ *
+ * LOCK should suffice since the actual taking of the lock must
+ * happen _before_ what follows.
+ */
+ might_sleep();
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_unlock_write(anon_vma);
+ }
+
+ kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
+{
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
+}
+
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+ kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
+{
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+}
+
+/**
+ * anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, but if
+ * not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
+ * and that may actually touch the rwsem even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_sem held for reading.
+ */
int anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
- struct anon_vma *allocated, *locked;
+ struct anon_vma *allocated;
+
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_enomem;
anon_vma = find_mergeable_anon_vma(vma);
- if (anon_vma) {
- allocated = NULL;
- locked = anon_vma;
- spin_lock(&locked->lock);
- } else {
+ allocated = NULL;
+ if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
- return -ENOMEM;
+ goto out_enomem_free_avc;
allocated = anon_vma;
- locked = NULL;
}
+ anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+ anon_vma_chain_link(vma, avc, anon_vma);
allocated = NULL;
+ avc = NULL;
}
spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(anon_vma);
- if (locked)
- spin_unlock(&locked->lock);
if (unlikely(allocated))
- anon_vma_free(allocated);
+ put_anon_vma(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
}
return 0;
-}
-void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
-{
- BUG_ON(vma->anon_vma != next->anon_vma);
- list_del(&next->anon_vma_node);
+ out_enomem_free_avc:
+ anon_vma_chain_free(avc);
+ out_enomem:
+ return -ENOMEM;
}
-void __anon_vma_link(struct vm_area_struct *vma)
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single down_write() for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
-
- if (anon_vma) {
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ up_write(&root->rwsem);
+ root = new_root;
+ down_write(&root->rwsem);
}
+ return root;
}
-void anon_vma_link(struct vm_area_struct *vma)
+static inline void unlock_anon_vma_root(struct anon_vma *root)
{
- struct anon_vma *anon_vma = vma->anon_vma;
+ if (root)
+ up_write(&root->rwsem);
+}
- if (anon_vma) {
- spin_lock(&anon_vma->lock);
- list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
- spin_unlock(&anon_vma->lock);
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+{
+ struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
+
+ list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
}
+ unlock_anon_vma_root(root);
+ return 0;
+
+ enomem_failure:
+ unlink_anon_vmas(dst);
+ return -ENOMEM;
}
-void anon_vma_unlink(struct vm_area_struct *vma)
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
- int empty;
+ struct anon_vma_chain *avc;
+ struct anon_vma *anon_vma;
+
+ /* Don't bother if the parent process has no anon_vma here. */
+ if (!pvma->anon_vma)
+ return 0;
+ /*
+ * First, attach the new VMA to the parent VMA's anon_vmas,
+ * so rmap can find non-COWed pages in child processes.
+ */
+ if (anon_vma_clone(vma, pvma))
+ return -ENOMEM;
+
+ /* Then add our own anon_vma. */
+ anon_vma = anon_vma_alloc();
if (!anon_vma)
- return;
+ goto out_error;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_error_free_anon_vma;
- spin_lock(&anon_vma->lock);
- validate_anon_vma(vma);
- list_del(&vma->anon_vma_node);
+ /*
+ * The root anon_vma's rwsem is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ /*
+ * With refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned until
+ * this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
+ /* Mark this anon_vma as the one where our new (COWed) pages go. */
+ vma->anon_vma = anon_vma;
+ anon_vma_lock_write(anon_vma);
+ anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_unlock_write(anon_vma);
- /* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
+ return 0;
- if (empty)
- anon_vma_free(anon_vma);
+ out_error_free_anon_vma:
+ put_anon_vma(anon_vma);
+ out_error:
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
}
-static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
- unsigned long flags)
+void unlink_anon_vmas(struct vm_area_struct *vma)
{
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- struct anon_vma *anon_vma = data;
+ struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
+
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+ continue;
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ unlock_anon_vma_root(root);
- spin_lock_init(&anon_vma->lock);
- INIT_LIST_HEAD(&anon_vma->head);
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to write-acquire the anon_vma->root->rwsem.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ put_anon_vma(anon_vma);
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
}
}
+static void anon_vma_ctor(void *data)
+{
+ struct anon_vma *anon_vma = data;
+
+ init_rwsem(&anon_vma->rwsem);
+ atomic_set(&anon_vma->refcount, 0);
+ anon_vma->rb_root = RB_ROOT;
+}
+
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
}
/*
- * Getting a lock on a stable anon_vma from a page off the LRU is
- * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization whatsoever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
*/
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_get_anon_vma(struct page *page)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) page->mapping;
- if (!(anon_mapping & PAGE_MAPPING_ANON))
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- spin_lock(&anon_vma->lock);
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ /*
+ * If this page is still mapped, then its anon_vma cannot have been
+ * freed. But if it has been unmapped, we have no security against the
+ * anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
+ * above cannot corrupt).
+ */
+ if (!page_mapped(page)) {
+ rcu_read_unlock();
+ put_anon_vma(anon_vma);
+ return NULL;
+ }
out:
rcu_read_unlock();
+
return anon_vma;
}
/*
- * At what user virtual address is page expected in vma?
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * It's a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the rwsem.
+ */
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
+{
+ struct anon_vma *anon_vma = NULL;
+ struct anon_vma *root_anon_vma;
+ unsigned long anon_mapping;
+
+ rcu_read_lock();
+ anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ goto out;
+ if (!page_mapped(page))
+ goto out;
+
+ anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
+ /*
+ * If the page is still mapped, then this anon_vma is still
+ * its anon_vma, and holding the rwsem ensures that it will
+ * not go away, see anon_vma_free().
+ */
+ if (!page_mapped(page)) {
+ up_read(&root_anon_vma->rwsem);
+ anon_vma = NULL;
+ }
+ goto out;
+ }
+
+ /* trylock failed, we got to sleep */
+ if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+ anon_vma = NULL;
+ goto out;
+ }
+
+ if (!page_mapped(page)) {
+ rcu_read_unlock();
+ put_anon_vma(anon_vma);
+ return NULL;
+ }
+
+ /* we pinned the anon_vma, it's safe to sleep */
+ rcu_read_unlock();
+ anon_vma_lock_read(anon_vma);
+
+ if (atomic_dec_and_test(&anon_vma->refcount)) {
+ /*
+ * Oops, we held the last refcount, release the lock
+ * and bail -- can't simply use put_anon_vma() because
+ * we'll deadlock on the anon_vma_lock_write() recursion.
+ */
+ anon_vma_unlock_read(anon_vma);
+ __put_anon_vma(anon_vma);
+ anon_vma = NULL;
+ }
+
+ return anon_vma;
+
+out:
+ rcu_read_unlock();
+ return anon_vma;
+}
+
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
+{
+ anon_vma_unlock_read(anon_vma);
+}
+
+/*
+ * At what user virtual address is page expected in @vma?
*/
static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long address;
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- /* page should be within any vma from prio_tree_next */
- BUG_ON(!PageAnon(page));
- return -EFAULT;
- }
return address;
}
/*
- * At what user virtual address is page expected in vma? checking that the
- * page matches the vma: currently only used on anon pages, by unuse_vma;
+ * At what user virtual address is page expected in vma?
+ * Caller should check the page is actually part of the vma.
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
+ unsigned long address;
if (PageAnon(page)) {
- if ((void *)vma->anon_vma !=
- (void *)page->mapping - PAGE_MAPPING_ANON)
+ struct anon_vma *page__anon_vma = page_anon_vma(page);
+ /*
+ * Note: swapoff's unuse_vma() is more efficient with this
+ * check, and needs it to match anon_vma when KSM is active.
+ */
+ if (!vma->anon_vma || !page__anon_vma ||
+ vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@@ -239,43 +554,79 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
} else
return -EFAULT;
- return vma_address(page, vma);
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ return -EFAULT;
+ return address;
+}
+
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+ pmd_t pmde;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ /*
+ * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = ACCESS_ONCE(*pmd);
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ pmd = NULL;
+out:
+ return pmd;
}
/*
* Check that @page is mapped at @address into @mm.
*
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
* On success returns with pte mapped and locked.
*/
-pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address, spinlock_t **ptlp)
+pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
+ unsigned long address, spinlock_t **ptlp, int sync)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ goto check;
+ }
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
- if (!pte_present(*pte)) {
+ if (!sync && !pte_present(*pte)) {
pte_unmap(pte);
return NULL;
}
ptl = pte_lockptr(mm, pmd);
+check:
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -285,242 +636,329 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
return NULL;
}
-/*
- * Subfunctions of page_referenced: page_referenced_one called
- * repeatedly from either page_referenced_anon or page_referenced_file.
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA. Only
+ * valid for normal file or anonymous VMAs.
*/
-static int page_referenced_one(struct page *page,
- struct vm_area_struct *vma, unsigned int *mapcount)
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
spinlock_t *ptl;
- int referenced = 0;
-
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
- pte = page_check_address(page, mm, address, &ptl);
- if (!pte)
- goto out;
-
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
-
- /* Pretend the page is referenced if the task has the
- swap token and is in the middle of a page fault. */
- if (mm != current->mm && has_swap_token(mm) &&
- rwsem_is_locked(&mm->mmap_sem))
- referenced++;
-
- (*mapcount)--;
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ return 0;
+ pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
+ if (!pte) /* the page is not in this mm */
+ return 0;
pte_unmap_unlock(pte, ptl);
-out:
- return referenced;
-}
-static int page_referenced_anon(struct page *page)
-{
- unsigned int mapcount;
- struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
- int referenced = 0;
-
- anon_vma = page_lock_anon_vma(page);
- if (!anon_vma)
- return referenced;
-
- mapcount = page_mapcount(page);
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- referenced += page_referenced_one(page, vma, &mapcount);
- if (!mapcount)
- break;
- }
- spin_unlock(&anon_vma->lock);
- return referenced;
+ return 1;
}
-/**
- * page_referenced_file - referenced check for object-based rmap
- * @page: the page we're checking references on.
- *
- * For an object-based mapped page, find all the places it is mapped and
- * check/clear the referenced flag. This is done by following the page->mapping
- * pointer, then walking the chain of vmas it holds. It returns the number
- * of references it found.
- *
- * This function is only called from page_referenced for object-based pages.
+struct page_referenced_arg {
+ int mapcount;
+ int referenced;
+ unsigned long vm_flags;
+ struct mem_cgroup *memcg;
+};
+/*
+ * arg: page_referenced_arg will be passed
*/
-static int page_referenced_file(struct page *page)
+static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
{
- unsigned int mapcount;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
int referenced = 0;
+ struct page_referenced_arg *pra = arg;
- /*
- * The caller's checks on page->mapping and !PageAnon have made
- * sure that this is a file page: the check for page->mapping
- * excludes the case just before it gets set on an anon page.
- */
- BUG_ON(PageAnon(page));
+ if (unlikely(PageTransHuge(page))) {
+ pmd_t *pmd;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_lock.
- */
- BUG_ON(!PageLocked(page));
+ /*
+ * rmap might return false positives; we must filter
+ * these out using page_check_address_pmd().
+ */
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+ if (!pmd)
+ return SWAP_AGAIN;
+
+ if (vma->vm_flags & VM_LOCKED) {
+ spin_unlock(ptl);
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
- spin_lock(&mapping->i_mmap_lock);
+ /* go ahead even if the pmd is pmd_trans_splitting() */
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ spin_unlock(ptl);
+ } else {
+ pte_t *pte;
- /*
- * i_mmap_lock does not stabilize mapcount at all, but mapcount
- * is more likely to be accurate if we note it after spinning.
- */
- mapcount = page_mapcount(page);
+ /*
+ * rmap might return false positives; we must filter
+ * these out using page_check_address().
+ */
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ return SWAP_AGAIN;
+
+ if (vma->vm_flags & VM_LOCKED) {
+ pte_unmap_unlock(pte, ptl);
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
- == (VM_LOCKED|VM_MAYSHARE)) {
- referenced++;
- break;
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ referenced++;
}
- referenced += page_referenced_one(page, vma, &mapcount);
- if (!mapcount)
- break;
+ pte_unmap_unlock(pte, ptl);
+ }
+
+ if (referenced) {
+ pra->referenced++;
+ pra->vm_flags |= vma->vm_flags;
}
- spin_unlock(&mapping->i_mmap_lock);
- return referenced;
+ pra->mapcount--;
+ if (!pra->mapcount)
+ return SWAP_SUCCESS; /* To break the loop */
+
+ return SWAP_AGAIN;
+}
+
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+{
+ struct page_referenced_arg *pra = arg;
+ struct mem_cgroup *memcg = pra->memcg;
+
+ if (!mm_match_cgroup(vma->vm_mm, memcg))
+ return true;
+
+ return false;
}
/**
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
+ * @memcg: target memory cgroup
+ * @vm_flags: collects the vma->vm_flags of the vmas that actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
-int page_referenced(struct page *page, int is_locked)
+int page_referenced(struct page *page,
+ int is_locked,
+ struct mem_cgroup *memcg,
+ unsigned long *vm_flags)
{
- int referenced = 0;
+ int ret;
+ int we_locked = 0;
+ struct page_referenced_arg pra = {
+ .mapcount = page_mapcount(page),
+ .memcg = memcg,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_referenced_one,
+ .arg = (void *)&pra,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ *vm_flags = 0;
+ if (!page_mapped(page))
+ return 0;
- if (page_test_and_clear_young(page))
- referenced++;
+ if (!page_rmapping(page))
+ return 0;
- if (TestClearPageReferenced(page))
- referenced++;
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked)
+ return 1;
+ }
- if (page_mapped(page) && page->mapping) {
- if (PageAnon(page))
- referenced += page_referenced_anon(page);
- else if (is_locked)
- referenced += page_referenced_file(page);
- else if (TestSetPageLocked(page))
- referenced++;
- else {
- if (page->mapping)
- referenced += page_referenced_file(page);
- unlock_page(page);
- }
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg) {
+ rwc.invalid_vma = invalid_page_referenced_vma;
}
- return referenced;
+
+ ret = rmap_walk(page, &rwc);
+ *vm_flags = pra.vm_flags;
+
+ if (we_locked)
+ unlock_page(page);
+
+ return pra.referenced;
}
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
- pte_t *pte, entry;
+ pte_t *pte;
spinlock_t *ptl;
int ret = 0;
+ int *cleaned = arg;
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
-
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
goto out;
- if (!pte_dirty(*pte) && !pte_write(*pte))
- goto unlock;
+ if (pte_dirty(*pte) || pte_write(*pte)) {
+ pte_t entry;
- entry = ptep_get_and_clear(mm, address, pte);
- entry = pte_mkclean(entry);
- entry = pte_wrprotect(entry);
- ptep_establish(vma, address, pte, entry);
- lazy_mmu_prot_update(entry);
- ret = 1;
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ entry = ptep_clear_flush(vma, address, pte);
+ entry = pte_wrprotect(entry);
+ entry = pte_mkclean(entry);
+ set_pte_at(mm, address, pte, entry);
+ ret = 1;
+ }
-unlock:
pte_unmap_unlock(pte, ptl);
+
+ if (ret) {
+ mmu_notifier_invalidate_page(mm, address);
+ (*cleaned)++;
+ }
out:
- return ret;
+ return SWAP_AGAIN;
}
-static int page_mkclean_file(struct address_space *mapping, struct page *page)
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
- int ret = 0;
+ if (vma->vm_flags & VM_SHARED)
+ return false;
- BUG_ON(PageAnon(page));
-
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- if (vma->vm_flags & VM_SHARED)
- ret += page_mkclean_one(page, vma);
- }
- spin_unlock(&mapping->i_mmap_lock);
- return ret;
+ return true;
}
int page_mkclean(struct page *page)
{
- int ret = 0;
+ int cleaned = 0;
+ struct address_space *mapping;
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&cleaned,
+ .rmap_one = page_mkclean_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
BUG_ON(!PageLocked(page));
- if (page_mapped(page)) {
- struct address_space *mapping = page_mapping(page);
- if (mapping)
- ret = page_mkclean_file(mapping, page);
- }
+ if (!page_mapped(page))
+ return 0;
- return ret;
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
+ rmap_walk(page, &rwc);
+
+ return cleaned;
}
+EXPORT_SYMBOL_GPL(page_mkclean);
/**
- * page_set_anon_rmap - setup new anonymous rmap
- * @page: the page to add the mapping to
- * @vma: the vm area in which the mapping is added
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page: the page to move to our anon_vma
+ * @vma: the vma the page belongs to
* @address: the user virtual address mapped
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
*/
-static void __page_set_anon_rmap(struct page *page,
+void page_move_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
- BUG_ON(!anon_vma);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON(!anon_vma);
+ VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
+
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
+}
+
+/**
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page: Page to add to rmap
+ * @vma: VM area to add page to.
+ * @address: User virtual address of the mapping
+ * @exclusive: the page is exclusively owned by the current process
+ */
+static void __page_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+ if (PageAnon(page))
+ return;
+
+ /*
+ * If the page isn't exclusively mapped into this vma,
+ * we must use the _oldest_ possible anon_vma for the
+ * page mapping!
+ */
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
+}
+/**
+ * __page_check_anon_rmap - sanity check anonymous rmap addition
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+ */
+static void __page_check_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
/*
- * nr_mapped state can be updated without turning off
- * interrupts because it is not modified via interrupt.
+ * The page's anon-rmap details (mapping and index) are guaranteed to
+ * be set up correctly at this point.
+ *
+ * We have exclusion against page_add_anon_rmap because the caller
+ * always holds the page locked, except if called from page_dup_rmap,
+ * in which case the page is already known to be set up.
+ *
+ * We have exclusion against page_add_new_anon_rmap because those pages
+ * are initially only visible via the pagetables, and the pte is locked
+ * over the call to page_add_new_anon_rmap.
*/
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
+ BUG_ON(page->index != linear_page_index(vma, address));
+#endif
}
/**
@@ -529,17 +967,51 @@ static void __page_set_anon_rmap(struct page *page,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the pte lock.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting,
+ * and to ensure that PageAnon is not being upgraded racily to PageKsm
+ * (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- if (atomic_inc_and_test(&page->_mapcount))
- __page_set_anon_rmap(page, vma, address);
- /* else checking page index and mapping is racy */
+ do_page_add_anon_rmap(page, vma, address, 0);
}
/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ int first = atomic_inc_and_test(&page->_mapcount);
+ if (first) {
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_state because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption
+ * disabled.
+ */
+ if (PageTransHuge(page))
+ __inc_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ hpage_nr_pages(page));
+ }
+ if (unlikely(PageKsm(page)))
+ return;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ /* address might be in next vma when migration races vma_adjust */
+ if (first)
+ __page_set_anon_rmap(page, vma, address, exclusive);
+ else
+ __page_check_anon_rmap(page, vma, address);
+}
+
+/**
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
@@ -547,12 +1019,38 @@ void page_add_anon_rmap(struct page *page,
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
+ * Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
- __page_set_anon_rmap(page, vma, address);
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ SetPageSwapBacked(page);
+ atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+ if (PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ hpage_nr_pages(page));
+ __page_set_anon_rmap(page, vma, address, 1);
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+ SetPageActive(page);
+ lru_cache_add(page);
+ return;
+ }
+
+ if (!TestSetPageMlocked(page)) {
+ /*
+ * We use the irq-unsafe __mod_zone_page_state because this
+ * counter is not modified from interrupt context, and the pte
+ * lock (a spinlock) is held, which implies preemption disabled.
+ */
+ __mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ add_page_to_unevictable_list(page);
}
/**
@@ -563,8 +1061,15 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- if (atomic_inc_and_test(&page->_mapcount))
+ bool locked;
+ unsigned long flags;
+
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ }
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/**
@@ -575,51 +1080,76 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
- if (atomic_add_negative(-1, &page->_mapcount)) {
-#ifdef CONFIG_DEBUG_VM
- if (unlikely(page_mapcount(page) < 0)) {
- printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
- printk (KERN_EMERG " page->flags = %lx\n", page->flags);
- printk (KERN_EMERG " page->count = %x\n", page_count(page));
- printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
- }
-#endif
- BUG_ON(page_mapcount(page) < 0);
- /*
- * It would be tidy to reset the PageAnon mapping here,
- * but that might overwrite a racing page_add_anon_rmap
- * which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_hot_cold_page,
- * and remember that it's only reliable while mapped.
- * Leaving it set also helps swapoff to reinstate ptes
- * faster for those pages still in swapcache.
- */
- if (page_test_and_clear_dirty(page))
- set_page_dirty(page);
- __dec_zone_page_state(page,
- PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ bool anon = PageAnon(page);
+ bool locked;
+ unsigned long flags;
+
+ /*
+ * The anon case has no mem_cgroup page_stat to update; but may
+ * uncharge_page() below, where the lock ordering can deadlock if
+ * we hold the lock against page_stat move: so avoid it on anon.
+ */
+ if (!anon)
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+
+ /*
+ * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+ * and not charged by memcg for now.
+ *
+ * We use the irq-unsafe __{inc|mod}_zone_page_state() because
+ * these counters are not modified in interrupt context, and
+ * the pte lock (a spinlock) is held, which implies that
+ * preemption is disabled.
+ */
+ if (unlikely(PageHuge(page)))
+ goto out;
+ if (anon) {
+ mem_cgroup_uncharge_page(page);
+ if (PageTransHuge(page))
+ __dec_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ -hpage_nr_pages(page));
+ } else {
+ __dec_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+ /*
+ * It would be tidy to reset the PageAnon mapping here,
+ * but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping
+ * before us: so leave the reset to free_hot_cold_page,
+ * and remember that it's only reliable while mapped.
+ * Leaving it set also helps swapoff to reinstate ptes
+ * faster for those pages still in swapcache.
+ */
+ return;
+out:
+ if (!anon)
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
/*
- * Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ * @arg: enum ttu_flags will be passed to this argument
*/
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- int migration)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long address;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
+ enum ttu_flags flags = (enum ttu_flags)arg;
- address = vma_address(page, vma);
- if (address == -EFAULT)
- goto out;
-
- pte = page_check_address(page, mm, address, &ptl);
+ pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -628,11 +1158,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
- if (!migration && ((vma->vm_flags & VM_LOCKED) ||
- (ptep_clear_flush_young(vma, address, pte)))) {
- ret = SWAP_FAIL;
- goto out_unmap;
+ if (!(flags & TTU_IGNORE_MLOCK)) {
+ if (vma->vm_flags & VM_LOCKED)
+ goto out_mlock;
+
+ if (flags & TTU_MUNLOCK)
+ goto out_unmap;
}
+ if (!(flags & TTU_IGNORE_ACCESS)) {
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
@@ -645,54 +1183,100 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageAnon(page)) {
+ if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (!PageHuge(page)) {
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+ }
+ set_pte_at(mm, address, pte,
+ swp_entry_to_pte(make_hwpoison_entry(page)));
+ } else if (pte_unused(pteval)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ */
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+ } else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
+ pte_t swp_pte;
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- swap_duplicate(entry);
+ if (swap_duplicate(entry) < 0) {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- dec_mm_counter(mm, anon_rss);
-#ifdef CONFIG_MIGRATION
- } else {
+ dec_mm_counter(mm, MM_ANONPAGES);
+ inc_mm_counter(mm, MM_SWAPENTS);
+ } else if (IS_ENABLED(CONFIG_MIGRATION)) {
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- BUG_ON(!migration);
+ BUG_ON(!(flags & TTU_MIGRATION));
entry = make_migration_entry(page, pte_write(pteval));
-#endif
}
- set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, address, pte, swp_pte);
BUG_ON(pte_file(*pte));
- } else
-#ifdef CONFIG_MIGRATION
- if (migration) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION)) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
} else
-#endif
- dec_mm_counter(mm, file_rss);
-
+ dec_mm_counter(mm, MM_FILEPAGES);
page_remove_rmap(page);
page_cache_release(page);
out_unmap:
pte_unmap_unlock(pte, ptl);
+ if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
+
+out_mlock:
+ pte_unmap_unlock(pte, ptl);
+
+
+ /*
+ * We need mmap_sem held; otherwise the VM_LOCKED check is
+ * unstable and racy. We also can't wait here, because we
+ * already hold anon_vma->rwsem or mapping->i_mmap_mutex.
+ * If the trylock fails, the page remains on the evictable LRU,
+ * and vmscan can later retry moving it to the unevictable LRU
+ * if the page is actually mlocked.
+ */
+ if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+ if (vma->vm_flags & VM_LOCKED) {
+ mlock_vma_page(page);
+ ret = SWAP_MLOCK;
+ }
+ up_read(&vma->vm_mm->mmap_sem);
+ }
+ return ret;
}
/*
@@ -713,23 +1297,30 @@ out:
* For very sparsely populated VMAs this is a little inefficient - chances are
* there there won't be many ptes located within the scan cluster. In this case
* maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
+ * acquire it without blocking. If vma locked, mlock the pages in the cluster,
+ * rather than unmapping them. If we encounter the "check_page" that vmscan is
+ * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
*/
#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
-static void try_to_unmap_cluster(unsigned long cursor,
- unsigned int *mapcount, struct vm_area_struct *vma)
+static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
+ struct vm_area_struct *vma, struct page *check_page)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
struct page *page;
unsigned long address;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
unsigned long end;
+ int ret = SWAP_AGAIN;
+ int locked_vma = 0;
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
@@ -738,17 +1329,23 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (end > vma->vm_end)
end = vma->vm_end;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return;
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
+ return ret;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
+ mmun_start = address;
+ mmun_end = end;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return;
+ /*
+ * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
+ * keep the sem while scanning the cluster for mlocking pages.
+ */
+ if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+ locked_vma = (vma->vm_flags & VM_LOCKED);
+ if (!locked_vma)
+ up_read(&vma->vm_mm->mmap_sem); /* don't need it */
+ }
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -761,7 +1358,24 @@ static void try_to_unmap_cluster(unsigned long cursor,
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
- if (ptep_clear_flush_young(vma, address, pte))
+ if (locked_vma) {
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
+ ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ continue; /* don't unmap */
+ }
+
+ if (ptep_clear_flush_young_notify(vma, address, pte))
continue;
/* Nuke the page table entry. */
@@ -769,8 +1383,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
- if (page->index != linear_page_index(vma, address))
- set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+ if (page->index != linear_page_index(vma, address)) {
+ pte_t ptfile = pgoff_to_pte(page->index);
+ if (pte_soft_dirty(pteval))
+ ptfile = pte_file_mksoft_dirty(ptfile);
+ set_pte_at(mm, address, pte, ptfile);
+ }
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -778,66 +1396,29 @@ static void try_to_unmap_cluster(unsigned long cursor,
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ dec_mm_counter(mm, MM_FILEPAGES);
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
-}
-
-static int try_to_unmap_anon(struct page *page, int migration)
-{
- struct anon_vma *anon_vma;
- struct vm_area_struct *vma;
- int ret = SWAP_AGAIN;
-
- anon_vma = page_lock_anon_vma(page);
- if (!anon_vma)
- return ret;
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- ret = try_to_unmap_one(page, vma, migration);
- if (ret == SWAP_FAIL || !page_mapped(page))
- break;
- }
- spin_unlock(&anon_vma->lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ if (locked_vma)
+ up_read(&vma->vm_mm->mmap_sem);
return ret;
}
-/**
- * try_to_unmap_file - unmap file page using the object-based rmap method
- * @page: the page to unmap
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * This function is only called from try_to_unmap for object-based pages.
- */
-static int try_to_unmap_file(struct page *page, int migration)
+static int try_to_unmap_nonlinear(struct page *page,
+ struct address_space *mapping, void *arg)
{
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
- ret = try_to_unmap_one(page, vma, migration);
- if (ret == SWAP_FAIL || !page_mapped(page))
- goto out;
- }
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
-
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
- if ((vma->vm_flags & VM_LOCKED) && !migration)
- continue;
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -846,9 +1427,8 @@ static int try_to_unmap_file(struct page *page, int migration)
max_nl_size = cursor;
}
- if (max_nl_size == 0) { /* any nonlinears locked or reserved */
- ret = SWAP_FAIL;
- goto out;
+ if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
+ return SWAP_FAIL;
}
/*
@@ -860,30 +1440,32 @@ static int try_to_unmap_file(struct page *page, int migration)
*/
mapcount = page_mapcount(page);
if (!mapcount)
- goto out;
- cond_resched_lock(&mapping->i_mmap_lock);
+ return ret;
+
+ cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
if (max_nl_cursor == 0)
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
- if ((vma->vm_flags & VM_LOCKED) && !migration)
- continue;
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
cursor = (unsigned long) vma->vm_private_data;
- while ( cursor < max_nl_cursor &&
+ while (cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
- try_to_unmap_cluster(cursor, &mapcount, vma);
+ if (try_to_unmap_cluster(cursor, &mapcount,
+ vma, page) == SWAP_MLOCK)
+ ret = SWAP_MLOCK;
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
- goto out;
+ return ret;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
- cond_resched_lock(&mapping->i_mmap_lock);
+ cond_resched();
max_nl_cursor += CLUSTER_SIZE;
} while (max_nl_cursor <= max_nl_size);
@@ -892,16 +1474,40 @@ static int try_to_unmap_file(struct page *page, int migration)
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
-out:
- spin_unlock(&mapping->i_mmap_lock);
+
return ret;
}
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
+{
+ return is_vma_temporary_stack(vma);
+}
+
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
+ * @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
@@ -910,20 +1516,259 @@ out:
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
+ * SWAP_MLOCK - page is mlocked.
*/
-int try_to_unmap(struct page *page, int migration)
+int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .file_nonlinear = try_to_unmap_nonlinear,
+ .anon_lock = page_lock_anon_vma_read,
+ };
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
- if (PageAnon(page))
- ret = try_to_unmap_anon(page, migration);
- else
- ret = try_to_unmap_file(page, migration);
+ /*
+ * During exec, a temporary VMA is set up and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
+ rwc.invalid_vma = invalid_migration_vma;
- if (!page_mapped(page))
+ ret = rmap_walk(page, &rwc);
+
+ if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
}
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from munlock code. Checks all of the VMAs mapping the page
+ * to make sure nobody else has this page mlocked. The page will be
+ * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ *
+ * Return values are:
+ *
+ * SWAP_AGAIN - no vma is holding page mlocked, or,
+ * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL - page cannot be located at present
+ * SWAP_MLOCK - page is now mlocked.
+ */
+int try_to_munlock(struct page *page)
+{
+ int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)TTU_MUNLOCK,
+ .done = page_not_mapped,
+ /*
+ * We don't bother to try to find the munlocked page in
+ * nonlinears. It's costly. Instead, later, page reclaim logic
+ * may call try_to_unmap() and recover PG_mlocked lazily.
+ */
+ .file_nonlinear = NULL,
+ .anon_lock = page_lock_anon_vma_read,
+
+ };
+
+ VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+
+ ret = rmap_walk(page, &rwc);
+ return ret;
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma)
+{
+ struct anon_vma *root = anon_vma->root;
+
+ anon_vma_free(anon_vma);
+ if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ anon_vma_free(root);
+}
+
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+ struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+
+ if (rwc->anon_lock)
+ return rwc->anon_lock(page);
+
+ /*
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+ * because that depends on page_mapped(); but not all its usages
+ * are holding mmap_sem. Users without mmap_sem are required to
+ * take a reference count to prevent the anon_vma disappearing
+ */
+ anon_vma = page_anon_vma(page);
+ if (!anon_vma)
+ return NULL;
+
+ anon_vma_lock_read(anon_vma);
+ return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
+
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (!anon_vma)
+ return ret;
+
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long address = vma_address(page, vma);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
+ if (ret != SWAP_AGAIN)
+ break;
+ if (rwc->done && rwc->done(page))
+ break;
+ }
+ anon_vma_unlock_read(anon_vma);
+ return ret;
+}
+
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct address_space *mapping = page->mapping;
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct vm_area_struct *vma;
+ int ret = SWAP_AGAIN;
+
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_mutex.
+ */
+ VM_BUG_ON(!PageLocked(page));
+
+ if (!mapping)
+ return ret;
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ unsigned long address = vma_address(page, vma);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
+ if (ret != SWAP_AGAIN)
+ goto done;
+ if (rwc->done && rwc->done(page))
+ goto done;
+ }
+
+ if (!rwc->file_nonlinear)
+ goto done;
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto done;
+
+ ret = rwc->file_nonlinear(page, mapping, rwc->arg);
+
+done:
+ mutex_unlock(&mapping->i_mmap_mutex);
+ return ret;
+}
+
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+{
+ if (unlikely(PageKsm(page)))
+ return rmap_walk_ksm(page, rwc);
+ else if (PageAnon(page))
+ return rmap_walk_anon(page, rwc);
+ else
+ return rmap_walk_file(page, rwc);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+
+ if (PageAnon(page))
+ return;
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!anon_vma);
+ /* address might be in next vma when migration races vma_adjust */
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(&page->_mapcount, 0);
+ __hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index eda907c3a86..af68b15a8fc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
* Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
@@ -14,31 +15,48 @@
* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*
+ * tiny-shmem:
+ * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
+ *
* This file is released under the GPL.
*/
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/mount.h>
+#include <linux/ramfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/aio.h>
+
+static struct vfsmount *shm_mnt;
+
+#ifdef CONFIG_SHMEM
/*
* This virtual memory filesystem is heavily based on the ramfs. It
* extends ramfs by the ability to use swap and honor resource limits
* which makes it a completely usable filesystem.
*/
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/swap.h>
-#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
-#include <linux/mount.h>
#include <linux/writeback.h>
-#include <linux/vfs.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/percpu_counter.h>
+#include <linux/falloc.h>
+#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
@@ -46,89 +64,66 @@
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
+#include <linux/seq_file.h>
+#include <linux/magic.h>
#include <asm/uaccess.h>
-#include <asm/div64.h>
#include <asm/pgtable.h>
-/* This magic number is used in glibc for posix shared memory */
-#define TMPFS_MAGIC 0x01021994
-
-#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
-#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
-
-#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
-#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
-
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
-/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
-#define SHMEM_PAGEIN VM_READ
-#define SHMEM_TRUNCATE VM_WRITE
-
-/* Definition to limit shmem_truncate's steps between cond_rescheds */
-#define LATENCY_LIMIT 64
-
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
-/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
+/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
+#define SHORT_SYMLINK_LEN 128
+
+/*
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+ wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
+ pgoff_t start; /* start of range currently being fallocated */
+ pgoff_t next; /* the next page offset to be fallocated */
+ pgoff_t nr_falloced; /* how many new pages have been fallocated */
+ pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+};
+
+/* Flag allocation requirements to shmem_getpage */
enum sgp_type {
- SGP_QUICK, /* don't try more than file page cache lookup */
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
- SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
+ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
+ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
};
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type);
-
-static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
-{
- /*
- * The above definition of ENTRIES_PER_PAGE, and the use of
- * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
- * might be reconsidered if it ever diverges from PAGE_SIZE.
- */
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static inline void shmem_dir_free(struct page *page)
-{
- __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
-}
-
-static struct page **shmem_dir_map(struct page *page)
-{
- return (struct page **)kmap_atomic(page, KM_USER0);
-}
-
-static inline void shmem_dir_unmap(struct page **dir)
+#ifdef CONFIG_TMPFS
+static unsigned long shmem_default_max_blocks(void)
{
- kunmap_atomic(dir, KM_USER0);
+ return totalram_pages / 2;
}
-static swp_entry_t *shmem_swp_map(struct page *page)
+static unsigned long shmem_default_max_inodes(void)
{
- return (swp_entry_t *)kmap_atomic(page, KM_USER1);
+ return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
}
+#endif
-static inline void shmem_swp_balance_unmap(void)
-{
- /*
- * When passing a pointer to an i_direct entry, to code which
- * also handles indirect entries and so will shmem_swp_unmap,
- * we must arrange for the preempt count to remain in balance.
- * What kmap_atomic of a lowmem page does depends on config
- * and architecture, so pretend to kmap_atomic some lowmem page.
- */
- (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
-}
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index);
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
-static inline void shmem_swp_unmap(swp_entry_t *entry)
+static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, int *fault_type)
{
- kunmap_atomic(entry, KM_USER1);
+ return shmem_getpage_gfp(inode, index, pagep, sgp,
+ mapping_gfp_mask(inode->i_mapping), fault_type);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -144,13 +139,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
*/
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
- return (flags & VM_ACCOUNT)?
- security_vm_enough_memory(VM_ACCT(size)): 0;
+ return (flags & VM_NORESERVE) ?
+ 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
- if (flags & VM_ACCOUNT)
+ if (!(flags & VM_NORESERVE))
vm_unacct_memory(VM_ACCT(size));
}
@@ -162,46 +157,59 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
*/
static inline int shmem_acct_block(unsigned long flags)
{
- return (flags & VM_ACCOUNT)?
- 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
+ return (flags & VM_NORESERVE) ?
+ security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
}
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
- if (!(flags & VM_ACCOUNT))
+ if (flags & VM_NORESERVE)
vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}
-static struct super_operations shmem_ops;
+static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
-static struct file_operations shmem_file_operations;
-static struct inode_operations shmem_inode_operations;
-static struct inode_operations shmem_dir_inode_operations;
-static struct vm_operations_struct shmem_vm_ops;
+static const struct file_operations shmem_file_operations;
+static const struct inode_operations shmem_inode_operations;
+static const struct inode_operations shmem_dir_inode_operations;
+static const struct inode_operations shmem_special_inode_operations;
+static const struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
- .unplug_io_fn = default_unplug_io_fn,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};
static LIST_HEAD(shmem_swaplist);
-static DEFINE_SPINLOCK(shmem_swaplist_lock);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
-static void shmem_free_blocks(struct inode *inode, long pages)
+static int shmem_reserve_inode(struct super_block *sb)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) {
+ spin_lock(&sbinfo->stat_lock);
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
+ spin_unlock(&sbinfo->stat_lock);
+ }
+ return 0;
+}
+
+static void shmem_free_inode(struct super_block *sb)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
- sbinfo->free_blocks += pages;
- inode->i_blocks -= pages*BLOCKS_PER_PAGE;
+ sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
}
}
-/*
- * shmem_recalc_inode - recalculate the size of an inode
- *
+/**
+ * shmem_recalc_inode - recalculate the block usage of an inode
* @inode: inode to recalc
*
* We have to calculate the free blocks since the mm can drop
@@ -219,603 +227,493 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed;
+ inode->i_blocks -= freed * BLOCKS_PER_PAGE;
shmem_unacct_blocks(info->flags, freed);
- shmem_free_blocks(inode, freed);
}
}
/*
- * shmem_swp_entry - find the swap vector position in the info structure
- *
- * @info: info structure for the inode
- * @index: index of the page to find
- * @page: optional page to add to the structure. Has to be preset to
- * all zeros
- *
- * If there is no space allocated yet it will return NULL when
- * page is NULL, else it will use the page for the needed block,
- * setting it to NULL on return to indicate that it has been used.
- *
- * The swap vector is organized the following way:
- *
- * There are SHMEM_NR_DIRECT entries directly stored in the
- * shmem_inode_info structure. So small files do not need an addional
- * allocation.
- *
- * For pages with index > SHMEM_NR_DIRECT there is the pointer
- * i_indirect which points to a page which holds in the first half
- * doubly indirect blocks, in the second half triple indirect blocks:
- *
- * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
- * following layout (for SHMEM_NR_DIRECT == 16):
+ * Replace item expected in radix tree by a new item, while holding tree lock.
+ */
+static int shmem_radix_tree_replace(struct address_space *mapping,
+ pgoff_t index, void *expected, void *replacement)
+{
+ void **pslot;
+ void *item;
+
+ VM_BUG_ON(!expected);
+ VM_BUG_ON(!replacement);
+ pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ if (!pslot)
+ return -ENOENT;
+ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
+ if (item != expected)
+ return -ENOENT;
+ radix_tree_replace_slot(pslot, replacement);
+ return 0;
+}
+
+/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
*
- * i_indirect -> dir --> 16-19
- * | +-> 20-23
- * |
- * +-->dir2 --> 24-27
- * | +-> 28-31
- * | +-> 32-35
- * | +-> 36-39
- * |
- * +-->dir3 --> 40-43
- * +-> 44-47
- * +-> 48-51
- * +-> 52-55
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
*/
-static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
+static bool shmem_confirm_swap(struct address_space *mapping,
+ pgoff_t index, swp_entry_t swap)
{
- unsigned long offset;
- struct page **dir;
- struct page *subdir;
+ void *item;
- if (index < SHMEM_NR_DIRECT) {
- shmem_swp_balance_unmap();
- return info->i_direct+index;
- }
- if (!info->i_indirect) {
- if (page) {
- info->i_indirect = *page;
- *page = NULL;
- }
- return NULL; /* need another page */
- }
+ rcu_read_lock();
+ item = radix_tree_lookup(&mapping->page_tree, index);
+ rcu_read_unlock();
+ return item == swp_to_radix_entry(swap);
+}
- index -= SHMEM_NR_DIRECT;
- offset = index % ENTRIES_PER_PAGE;
- index /= ENTRIES_PER_PAGE;
- dir = shmem_dir_map(info->i_indirect);
-
- if (index >= ENTRIES_PER_PAGE/2) {
- index -= ENTRIES_PER_PAGE/2;
- dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
- index %= ENTRIES_PER_PAGE;
- subdir = *dir;
- if (!subdir) {
- if (page) {
- *dir = *page;
- *page = NULL;
- }
- shmem_dir_unmap(dir);
- return NULL; /* need another page */
- }
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
- }
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+ struct address_space *mapping,
+ pgoff_t index, gfp_t gfp, void *expected)
+{
+ int error;
- dir += index;
- subdir = *dir;
- if (!subdir) {
- if (!page || !(subdir = *page)) {
- shmem_dir_unmap(dir);
- return NULL; /* need a page */
- }
- *dir = subdir;
- *page = NULL;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
+ if (!expected)
+ error = radix_tree_insert(&mapping->page_tree, index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index, expected,
+ page);
+ if (!error) {
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- shmem_dir_unmap(dir);
- return shmem_swp_map(subdir) + offset;
+ return error;
}
-static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
{
- long incdec = value? 1: -1;
+ struct address_space *mapping = page->mapping;
+ int error;
- entry->val = value;
- info->swapped += incdec;
- if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
- struct page *page = kmap_atomic_to_page(entry);
- set_page_private(page, page_private(page) + incdec);
- }
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ page->mapping = NULL;
+ mapping->nrpages--;
+ __dec_zone_page_state(page, NR_FILE_PAGES);
+ __dec_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
+ BUG_ON(error);
}
/*
- * shmem_swp_alloc - get the position of the swap entry for the page.
- * If it does not exist allocate the entry.
- *
- * @info: info structure for the inode
- * @index: index of the page to find
- * @sgp: check and recheck i_size? skip allocation?
+ * Remove swap entry from radix tree, free the swap and its page cache.
*/
-static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
+static int shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
{
- struct inode *inode = &info->vfs_inode;
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- struct page *page = NULL;
- swp_entry_t *entry;
+ void *old;
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return ERR_PTR(-EINVAL);
+ spin_lock_irq(&mapping->tree_lock);
+ old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
+ spin_unlock_irq(&mapping->tree_lock);
+ if (old != radswap)
+ return -ENOENT;
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return 0;
+}
- while (!(entry = shmem_swp_entry(info, index, &page))) {
- if (sgp == SGP_READ)
- return shmem_swp_map(ZERO_PAGE(0));
+/*
+ * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
+ */
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index = 0;
+
+ pagevec_init(&pvec, 0);
+ /*
+ * Minor point, but we might as well stop if someone else SHM_LOCKs it.
+ */
+ while (!mapping_unevictable(mapping)) {
/*
- * Test free_blocks against 1 not 0, since we have 1 data
- * page (and perhaps indirect index pages) yet to allocate:
- * a waste to allocate index if we cannot allocate data.
+ * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+ * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
*/
- if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks <= 1) {
- spin_unlock(&sbinfo->stat_lock);
- return ERR_PTR(-ENOSPC);
- }
- sbinfo->free_blocks--;
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
- }
-
- spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
- if (page)
- set_page_private(page, 0);
- spin_lock(&info->lock);
-
- if (!page) {
- shmem_free_blocks(inode, 1);
- return ERR_PTR(-ENOMEM);
- }
- if (sgp != SGP_WRITE &&
- ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
- entry = ERR_PTR(-EINVAL);
+ pvec.nr = find_get_entries(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
+ if (!pvec.nr)
break;
- }
- if (info->next_index <= index)
- info->next_index = index + 1;
- }
- if (page) {
- /* another task gave its page, or truncated the file */
- shmem_free_blocks(inode, 1);
- shmem_dir_free(page);
+ index = indices[pvec.nr - 1] + 1;
+ pagevec_remove_exceptionals(&pvec);
+ check_move_unevictable_pages(pvec.pages, pvec.nr);
+ pagevec_release(&pvec);
+ cond_resched();
}
- if (info->next_index <= index && !IS_ERR(entry))
- info->next_index = index + 1;
- return entry;
}
/*
- * shmem_free_swp - free some swap entries in a directory
- *
- * @dir: pointer to the directory
- * @edir: pointer after last entry of the directory
+ * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+ bool unfalloc)
{
- swp_entry_t *ptr;
- int freed = 0;
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+ unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+ unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ long nr_swaps_freed = 0;
+ pgoff_t index;
+ int i;
+
+ if (lend == -1)
+ end = -1; /* unsigned, so actually very big */
+
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index < end) {
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
+ if (!pvec.nr)
+ break;
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ index = indices[i];
+ if (index >= end)
+ break;
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val) {
- free_swap_and_cache(*ptr);
- *ptr = (swp_entry_t){0};
- freed++;
+ if (radix_tree_exceptional_entry(page)) {
+ if (unfalloc)
+ continue;
+ nr_swaps_freed += !shmem_free_swap(mapping,
+ index, page);
+ continue;
+ }
+
+ if (!trylock_page(page))
+ continue;
+ if (!unfalloc || !PageUptodate(page)) {
+ if (page->mapping == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ truncate_inode_page(mapping, page);
+ }
+ }
+ unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+ index++;
}
- return freed;
-}
-
-static int shmem_map_and_free_swp(struct page *subdir,
- int offset, int limit, struct page ***dir)
-{
- swp_entry_t *ptr;
- int freed = 0;
-
- ptr = shmem_swp_map(subdir);
- for (; offset < limit; offset += LATENCY_LIMIT) {
- int size = limit - offset;
- if (size > LATENCY_LIMIT)
- size = LATENCY_LIMIT;
- freed += shmem_free_swp(ptr+offset, ptr+offset+size);
- if (need_resched()) {
- shmem_swp_unmap(ptr);
- if (*dir) {
- shmem_dir_unmap(*dir);
- *dir = NULL;
+
+ if (partial_start) {
+ struct page *page = NULL;
+ shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+ if (page) {
+ unsigned int top = PAGE_CACHE_SIZE;
+ if (start > end) {
+ top = partial_end;
+ partial_end = 0;
}
- cond_resched();
- ptr = shmem_swp_map(subdir);
+ zero_user_segment(page, partial_start, top);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
}
}
- shmem_swp_unmap(ptr);
- return freed;
-}
-
-static void shmem_free_pages(struct list_head *next)
-{
- struct page *page;
- int freed = 0;
-
- do {
- page = container_of(next, struct page, lru);
- next = next->next;
- shmem_dir_free(page);
- freed++;
- if (freed >= LATENCY_LIMIT) {
- cond_resched();
- freed = 0;
+ if (partial_end) {
+ struct page *page = NULL;
+ shmem_getpage(inode, end, &page, SGP_READ, NULL);
+ if (page) {
+ zero_user_segment(page, 0, partial_end);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
}
- } while (next);
-}
-
-static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
-{
- struct shmem_inode_info *info = SHMEM_I(inode);
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- unsigned long diroff;
- struct page **dir;
- struct page *topdir;
- struct page *middir;
- struct page *subdir;
- swp_entry_t *ptr;
- LIST_HEAD(pages_to_free);
- long nr_pages_to_free = 0;
- long nr_swaps_freed = 0;
- int offset;
- int freed;
- int punch_hole = 0;
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (idx >= info->next_index)
+ }
+ if (start >= end)
return;
- spin_lock(&info->lock);
- info->flags |= SHMEM_TRUNCATE;
- if (likely(end == (loff_t) -1)) {
- limit = info->next_index;
- info->next_index = idx;
- } else {
- limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (limit > info->next_index)
- limit = info->next_index;
- punch_hole = 1;
- }
+ index = start;
+ while (index < end) {
+ cond_resched();
- topdir = info->i_indirect;
- if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
- info->i_indirect = NULL;
- nr_pages_to_free++;
- list_add(&topdir->lru, &pages_to_free);
- }
- spin_unlock(&info->lock);
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
+ if (!pvec.nr) {
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
+ break;
+ /* But if truncating, restart to make sure all gone */
+ index = start;
+ continue;
+ }
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- if (info->swapped && idx < SHMEM_NR_DIRECT) {
- ptr = info->i_direct;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
- }
- if (!topdir)
- goto done2;
-
- BUG_ON(limit <= SHMEM_NR_DIRECT);
- limit -= SHMEM_NR_DIRECT;
- idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
- offset = idx % ENTRIES_PER_PAGE;
- idx -= offset;
-
- dir = shmem_dir_map(topdir);
- stage = ENTRIES_PER_PAGEPAGE/2;
- if (idx < ENTRIES_PER_PAGEPAGE/2) {
- middir = topdir;
- diroff = idx/ENTRIES_PER_PAGE;
- } else {
- dir += ENTRIES_PER_PAGE/2;
- dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
- while (stage <= idx)
- stage += ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- if (*dir) {
- diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
- ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
- if (!diroff && !offset) {
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (radix_tree_exceptional_entry(page)) {
+ if (unfalloc)
+ continue;
+ if (shmem_free_swap(mapping, index, page)) {
+ /* Swap was replaced by page: retry */
+ index--;
+ break;
+ }
+ nr_swaps_freed++;
+ continue;
}
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(middir);
- } else {
- diroff = 0;
- offset = 0;
- idx = stage;
- }
- }
- for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(topdir) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto done1;
+ lock_page(page);
+ if (!unfalloc || !PageUptodate(page)) {
+ if (page->mapping == mapping) {
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ truncate_inode_page(mapping, page);
+ } else {
+ /* Page was replaced by swap: retry */
+ unlock_page(page);
+ index--;
+ break;
+ }
}
- stage = idx + ENTRIES_PER_PAGEPAGE;
- middir = *dir;
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
- shmem_dir_unmap(dir);
- cond_resched();
- dir = shmem_dir_map(middir);
- diroff = 0;
- }
- subdir = dir[diroff];
- if (subdir && page_private(subdir)) {
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- freed = shmem_map_and_free_swp(subdir,
- offset, size, &dir);
- if (!dir)
- dir = shmem_dir_map(middir);
- nr_swaps_freed += freed;
- if (offset)
- spin_lock(&info->lock);
- set_page_private(subdir, page_private(subdir) - freed);
- if (offset)
- spin_unlock(&info->lock);
- if (!punch_hole)
- BUG_ON(page_private(subdir) > offset);
- }
- if (offset)
- offset = 0;
- else if (subdir && !page_private(subdir)) {
- dir[diroff] = NULL;
- nr_pages_to_free++;
- list_add(&subdir->lru, &pages_to_free);
+ unlock_page(page);
}
- }
-done1:
- shmem_dir_unmap(dir);
-done2:
- if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
- /*
- * Call truncate_inode_pages again: racing shmem_unuse_inode
- * may have swizzled a page in from swap since vmtruncate or
- * generic_delete_inode did it, before we lowered next_index.
- * Also, though shmem_getpage checks i_size before adding to
- * cache, no recheck after: so fix the narrow window there too.
- */
- truncate_inode_pages_range(inode->i_mapping, start, end);
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
}
spin_lock(&info->lock);
- info->flags &= ~SHMEM_TRUNCATE;
info->swapped -= nr_swaps_freed;
- if (nr_pages_to_free)
- shmem_free_blocks(inode, nr_pages_to_free);
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
-
- /*
- * Empty swap vector directory pages to be freed?
- */
- if (!list_empty(&pages_to_free)) {
- pages_to_free.prev->next = NULL;
- shmem_free_pages(pages_to_free.next);
- }
}
-static void shmem_truncate(struct inode *inode)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
- shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
+ shmem_undo_range(inode, lstart, lend, false);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
- struct page *page = NULL;
int error;
- if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size < inode->i_size) {
- /*
- * If truncating down to a partial page, then
- * if that page is already allocated, hold it
- * in memory until the truncation is over, so
- * truncate_partial_page cannnot miss it were
- * it assigned to swap.
- */
- if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
- (void) shmem_getpage(inode,
- attr->ia_size>>PAGE_CACHE_SHIFT,
- &page, SGP_READ, NULL);
- }
- /*
- * Reset SHMEM_PAGEIN flag so that shmem_truncate can
- * detect if any pages might have been added to cache
- * after truncate_inode_pages. But we needn't bother
- * if it's being fully truncated to zero-length: the
- * nrpages check is efficient enough in that case.
- */
- if (attr->ia_size) {
- struct shmem_inode_info *info = SHMEM_I(inode);
- spin_lock(&info->lock);
- info->flags &= ~SHMEM_PAGEIN;
- spin_unlock(&info->lock);
- }
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ loff_t oldsize = inode->i_size;
+ loff_t newsize = attr->ia_size;
+
+ if (newsize != oldsize) {
+ i_size_write(inode, newsize);
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ }
+ if (newsize < oldsize) {
+ loff_t holebegin = round_up(newsize, PAGE_SIZE);
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+ shmem_truncate_range(inode, newsize, (loff_t)-1);
+ /* unmap again to remove racily COWed private pages */
+ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
}
}
- error = inode_change_ok(inode, attr);
- if (!error)
- error = inode_setattr(inode, attr);
- if (page)
- page_cache_release(page);
+ setattr_copy(inode, attr);
+ if (attr->ia_valid & ATTR_MODE)
+ error = posix_acl_chmod(inode, inode->i_mode);
return error;
}
-static void shmem_delete_inode(struct inode *inode)
+static void shmem_evict_inode(struct inode *inode)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
- if (inode->i_op->truncate == shmem_truncate) {
- truncate_inode_pages(inode->i_mapping, 0);
+ if (inode->i_mapping->a_ops == &shmem_aops) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
- shmem_truncate(inode);
+ shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->swaplist)) {
- spin_lock(&shmem_swaplist_lock);
+ mutex_lock(&shmem_swaplist_mutex);
list_del_init(&info->swaplist);
- spin_unlock(&shmem_swaplist_lock);
+ mutex_unlock(&shmem_swaplist_mutex);
}
- }
- BUG_ON(inode->i_blocks);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
+ } else
+ kfree(info->symlink);
+
+ simple_xattrs_free(&info->xattrs);
+ WARN_ON(inode->i_blocks);
+ shmem_free_inode(inode->i_sb);
clear_inode(inode);
}
-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
-{
- swp_entry_t *ptr;
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ */
+static int shmem_unuse_inode(struct shmem_inode_info *info,
+ swp_entry_t swap, struct page **pagep)
+{
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ void *radswap;
+ pgoff_t index;
+ gfp_t gfp;
+ int error = 0;
+
+ radswap = swp_to_radix_entry(swap);
+ index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ if (index == -1)
+ return 0;
- for (ptr = dir; ptr < edir; ptr++) {
- if (ptr->val == entry.val)
- return ptr - dir;
+ /*
+ * Move _head_ to start search for next from here.
+ * But be careful: shmem_evict_inode checks list_empty without taking
+ * mutex, and there's an instant in list_move_tail when info->swaplist
+ * would appear empty, if it were the only one on shmem_swaplist.
+ */
+ if (shmem_swaplist.next != &info->swaplist)
+ list_move_tail(&shmem_swaplist, &info->swaplist);
+
+ gfp = mapping_gfp_mask(mapping);
+ if (shmem_should_replace_page(*pagep, gfp)) {
+ mutex_unlock(&shmem_swaplist_mutex);
+ error = shmem_replace_page(pagep, gfp, info, index);
+ mutex_lock(&shmem_swaplist_mutex);
+ /*
+ * We needed to drop mutex to make that restrictive page
+ * allocation, but the inode might have been freed while we
+ * dropped it: although a racing shmem_evict_inode() cannot
+ * complete without emptying the radix_tree, our page lock
+ * on this swapcache page is not enough to prevent that -
+ * free_swap_and_cache() of our swap entry will only
+ * trylock_page(), removing swap from radix_tree whatever.
+ *
+ * We must not proceed to shmem_add_to_page_cache() if the
+ * inode has been freed, but of course we cannot rely on
+ * inode or mapping or info to check that. However, we can
+ * safely check if our swap entry is still in use (and here
+ * it can't have got reused for another page): if it's still
+ * in use, then the inode cannot have been freed yet, and we
+ * can safely proceed (if it's no longer in use, that tells
+ * nothing about the inode, but we don't need to unuse swap).
+ */
+ if (!page_swapcount(*pagep))
+ error = -ENOENT;
}
- return -1;
-}
-static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
-{
- struct inode *inode;
- unsigned long idx;
- unsigned long size;
- unsigned long limit;
- unsigned long stage;
- struct page **dir;
- struct page *subdir;
- swp_entry_t *ptr;
- int offset;
-
- idx = 0;
- ptr = info->i_direct;
- spin_lock(&info->lock);
- limit = info->next_index;
- size = limit;
- if (size > SHMEM_NR_DIRECT)
- size = SHMEM_NR_DIRECT;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0) {
- shmem_swp_balance_unmap();
- goto found;
- }
- if (!info->i_indirect)
- goto lost2;
-
- dir = shmem_dir_map(info->i_indirect);
- stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
-
- for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
- if (unlikely(idx == stage)) {
- shmem_dir_unmap(dir-1);
- dir = shmem_dir_map(info->i_indirect) +
- ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
- while (!*dir) {
- dir++;
- idx += ENTRIES_PER_PAGEPAGE;
- if (idx >= limit)
- goto lost1;
- }
- stage = idx + ENTRIES_PER_PAGEPAGE;
- subdir = *dir;
- shmem_dir_unmap(dir);
- dir = shmem_dir_map(subdir);
- }
- subdir = *dir;
- if (subdir && page_private(subdir)) {
- ptr = shmem_swp_map(subdir);
- size = limit - idx;
- if (size > ENTRIES_PER_PAGE)
- size = ENTRIES_PER_PAGE;
- offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0) {
- shmem_dir_unmap(dir);
- goto found;
- }
- shmem_swp_unmap(ptr);
- }
- }
-lost1:
- shmem_dir_unmap(dir-1);
-lost2:
- spin_unlock(&info->lock);
- return 0;
-found:
- idx += offset;
- inode = &info->vfs_inode;
- if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, ptr + offset, 0);
- }
- shmem_swp_unmap(ptr);
- spin_unlock(&info->lock);
/*
- * Decrement swap count even when the entry is left behind:
- * try_to_unuse will skip over mms, then reincrement count.
+ * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
+ * but also to hold up shmem_evict_inode(): so inode cannot be freed
+ * beneath us (pagelock doesn't help until the page is in pagecache).
*/
- swap_free(entry);
- return 1;
+ if (!error)
+ error = shmem_add_to_page_cache(*pagep, mapping, index,
+ GFP_NOWAIT, radswap);
+ if (error != -ENOMEM) {
+ /*
+ * Truncation and eviction use free_swap_and_cache(), which
+ * only does trylock page: if we raced, best clean up here.
+ */
+ delete_from_swap_cache(*pagep);
+ set_page_dirty(*pagep);
+ if (!error) {
+ spin_lock(&info->lock);
+ info->swapped--;
+ spin_unlock(&info->lock);
+ swap_free(swap);
+ }
+ error = 1; /* not an error, but entry was found */
+ }
+ return error;
}
/*
- * shmem_unuse() search for an eventually swapped out shmem page.
+ * Search through swapped inodes to find and replace swap by page.
*/
-int shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t swap, struct page *page)
{
- struct list_head *p, *next;
+ struct list_head *this, *next;
struct shmem_inode_info *info;
int found = 0;
+ int error = 0;
+
+ /*
+ * There's a faint possibility that swap page was replaced before
+ * caller locked it: caller will come back later with the right page.
+ */
+ if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
+ goto out;
- spin_lock(&shmem_swaplist_lock);
- list_for_each_safe(p, next, &shmem_swaplist) {
- info = list_entry(p, struct shmem_inode_info, swaplist);
- if (!info->swapped)
+ /*
+ * Charge page using GFP_KERNEL while we can wait, before taking
+ * the shmem_swaplist_mutex which might hold up shmem_writepage().
+ * Charged back to the user (not to caller) when swap account is used.
+ */
+ error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
+ if (error)
+ goto out;
+ /* No radix_tree_preload: swap entry keeps a place for page in tree */
+
+ mutex_lock(&shmem_swaplist_mutex);
+ list_for_each_safe(this, next, &shmem_swaplist) {
+ info = list_entry(this, struct shmem_inode_info, swaplist);
+ if (info->swapped)
+ found = shmem_unuse_inode(info, swap, &page);
+ else
list_del_init(&info->swaplist);
- else if (shmem_unuse_inode(info, entry, page)) {
- /* move head to start search for next from here */
- list_move_tail(&shmem_swaplist, &info->swaplist);
- found = 1;
+ cond_resched();
+ if (found)
break;
- }
}
- spin_unlock(&shmem_swaplist_lock);
- return found;
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ if (found < 0)
+ error = found;
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ return error;
}
/*
@@ -824,485 +722,624 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
struct shmem_inode_info *info;
- swp_entry_t *entry, swap;
struct address_space *mapping;
- unsigned long index;
struct inode *inode;
+ swp_entry_t swap;
+ pgoff_t index;
BUG_ON(!PageLocked(page));
- BUG_ON(page_mapped(page));
-
mapping = page->mapping;
index = page->index;
inode = mapping->host;
info = SHMEM_I(inode);
if (info->flags & VM_LOCKED)
goto redirty;
+ if (!total_swap_pages)
+ goto redirty;
+
+ /*
+ * shmem_backing_dev_info's capabilities prevent regular writeback or
+ * sync from ever calling shmem_writepage; but a stacking filesystem
+ * might use ->writepage of its underlying filesystem, in which case
+ * tmpfs should write out to swap only in response to memory pressure,
+ * and not for the writeback threads or sync.
+ */
+ if (!wbc->for_reclaim) {
+ WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
+ goto redirty;
+ }
+
+ /*
+ * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+ * value into swapfile.c, the only way we can correctly account for a
+ * fallocated page arriving here is now to initialize it and write it.
+ *
+ * That's okay for a page already fallocated earlier, but if we have
+ * not yet completed the fallocation, then (a) we want to keep track
+ * of this page in case we have to undo it, and (b) it may not be a
+ * good idea to continue anyway, once we're pushing into swap. So
+ * reactivate the page, and let shmem_fallocate() quit when too many.
+ */
+ if (!PageUptodate(page)) {
+ if (inode->i_private) {
+ struct shmem_falloc *shmem_falloc;
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ !shmem_falloc->waitq &&
+ index >= shmem_falloc->start &&
+ index < shmem_falloc->next)
+ shmem_falloc->nr_unswapped++;
+ else
+ shmem_falloc = NULL;
+ spin_unlock(&inode->i_lock);
+ if (shmem_falloc)
+ goto redirty;
+ }
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
swap = get_swap_page();
if (!swap.val)
goto redirty;
- spin_lock(&info->lock);
- shmem_recalc_inode(inode);
- if (index >= info->next_index) {
- BUG_ON(!(info->flags & SHMEM_TRUNCATE));
- goto unlock;
- }
- entry = shmem_swp_entry(info, index, NULL);
- BUG_ON(!entry);
- BUG_ON(entry->val);
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the page is
+ * moved to swap cache, when its pagelock no longer protects
+ * the inode from eviction. But don't unlock the mutex until
+ * we've incremented swapped, because shmem_unuse_inode() will
+ * prune a !swapped inode from the swaplist under this mutex.
+ */
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add_tail(&info->swaplist, &shmem_swaplist);
- if (move_to_swap_cache(page, swap) == 0) {
- shmem_swp_set(info, entry, swap.val);
- shmem_swp_unmap(entry);
+ if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+ swap_shmem_alloc(swap);
+ shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+ spin_lock(&info->lock);
+ info->swapped++;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- if (list_empty(&info->swaplist)) {
- spin_lock(&shmem_swaplist_lock);
- /* move instead of add in case we're racing */
- list_move_tail(&info->swaplist, &shmem_swaplist);
- spin_unlock(&shmem_swaplist_lock);
- }
- unlock_page(page);
+
+ mutex_unlock(&shmem_swaplist_mutex);
+ BUG_ON(page_mapped(page));
+ swap_writepage(page, wbc);
return 0;
}
- shmem_swp_unmap(entry);
-unlock:
- spin_unlock(&info->lock);
- swap_free(swap);
+ mutex_unlock(&shmem_swaplist_mutex);
+ swapcache_free(swap, NULL);
redirty:
set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
+ if (wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
+ unlock_page(page);
+ return 0;
}
#ifdef CONFIG_NUMA
-static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+#ifdef CONFIG_TMPFS
+static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
- char *nodelist = strchr(value, ':');
- int err = 1;
+ char buffer[64];
- if (nodelist) {
- /* NUL-terminate policy string */
- *nodelist++ = '\0';
- if (nodelist_parse(nodelist, *policy_nodes))
- goto out;
- }
- if (!strcmp(value, "default")) {
- *policy = MPOL_DEFAULT;
- /* Don't allow a nodelist */
- if (!nodelist)
- err = 0;
- } else if (!strcmp(value, "prefer")) {
- *policy = MPOL_PREFERRED;
- /* Insist on a nodelist of one node only */
- if (nodelist) {
- char *rest = nodelist;
- while (isdigit(*rest))
- rest++;
- if (!*rest)
- err = 0;
- }
- } else if (!strcmp(value, "bind")) {
- *policy = MPOL_BIND;
- /* Insist on a nodelist */
- if (nodelist)
- err = 0;
- } else if (!strcmp(value, "interleave")) {
- *policy = MPOL_INTERLEAVE;
- /* Default to nodes online if no nodelist */
- if (!nodelist)
- *policy_nodes = node_online_map;
- err = 0;
+ if (!mpol || mpol->mode == MPOL_DEFAULT)
+ return; /* show nothing */
+
+ mpol_to_str(buffer, sizeof(buffer), mpol);
+
+ seq_printf(seq, ",mpol=%s", buffer);
+}
+
+static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+ struct mempolicy *mpol = NULL;
+ if (sbinfo->mpol) {
+ spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ mpol = sbinfo->mpol;
+ mpol_get(mpol);
+ spin_unlock(&sbinfo->stat_lock);
}
-out:
- /* Restore string for error message */
- if (nodelist)
- *--nodelist = ':';
- return err;
+ return mpol;
}
+#endif /* CONFIG_TMPFS */
-static struct page *shmem_swapin_async(struct shared_policy *p,
- swp_entry_t entry, unsigned long idx)
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
- struct page *page;
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- pvma.vm_end = PAGE_SIZE;
- pvma.vm_pgoff = idx;
- pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
- page = read_swap_cache_async(entry, &pvma, 0);
- mpol_free(pvma.vm_policy);
- return page;
-}
+ pvma.vm_start = 0;
+ /* Bias interleave by inode number to distribute better across nodes */
+ pvma.vm_pgoff = index + info->vfs_inode.i_ino;
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
- unsigned long idx)
-{
- struct shared_policy *p = &info->policy;
- int i, num;
- struct page *page;
- unsigned long offset;
+ page = swapin_readahead(swap, gfp, &pvma, 0);
- num = valid_swaphandles(entry, &offset);
- for (i = 0; i < num; offset++, i++) {
- page = shmem_swapin_async(p,
- swp_entry(swp_type(entry), offset), idx);
- if (!page)
- break;
- page_cache_release(page);
- }
- lru_add_drain(); /* Push any new pages onto the LRU now */
- return shmem_swapin_async(p, entry, idx);
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
-static struct page *
-shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
- unsigned long idx)
+static struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
- pvma.vm_pgoff = idx;
- pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
- mpol_free(pvma.vm_policy);
+ /* Create a pseudo vma that just contains the policy */
+ pvma.vm_start = 0;
+ /* Bias interleave by inode number to distribute better across nodes */
+ pvma.vm_pgoff = index + info->vfs_inode.i_ino;
+ pvma.vm_ops = NULL;
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+ page = alloc_page_vma(gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
return page;
}
-#else
-static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+#else /* !CONFIG_NUMA */
+#ifdef CONFIG_TMPFS
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
- return 1;
+}
+#endif /* CONFIG_TMPFS */
+
+static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ return swapin_readahead(swap, gfp, NULL, 0);
}
-static inline struct page *
-shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
+static inline struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
{
- swapin_readahead(entry, 0, NULL);
- return read_swap_cache_async(entry, NULL, 0);
+ return alloc_page(gfp);
}
+#endif /* CONFIG_NUMA */
-static inline struct page *
-shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
+#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
- return alloc_page(gfp | __GFP_ZERO);
+ return NULL;
}
#endif
/*
- * shmem_getpage - either get the page from swap or allocate a new one
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to. If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+ return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ struct page *oldpage, *newpage;
+ struct address_space *swap_mapping;
+ pgoff_t swap_index;
+ int error;
+
+ oldpage = *pagep;
+ swap_index = page_private(oldpage);
+ swap_mapping = page_mapping(oldpage);
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success by further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ newpage = shmem_alloc_page(gfp, info, index);
+ if (!newpage)
+ return -ENOMEM;
+
+ page_cache_get(newpage);
+ copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
+
+ __set_page_locked(newpage);
+ SetPageUptodate(newpage);
+ SetPageSwapBacked(newpage);
+ set_page_private(newpage, swap_index);
+ SetPageSwapCache(newpage);
+
+ /*
+ * Our caller will very soon move newpage out of swapcache, but it's
+ * a nice clean interface for us to replace oldpage by newpage there.
+ */
+ spin_lock_irq(&swap_mapping->tree_lock);
+ error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+ newpage);
+ if (!error) {
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ }
+ spin_unlock_irq(&swap_mapping->tree_lock);
+
+ if (unlikely(error)) {
+ /*
+ * Is this possible? I think not, now that our callers check
+ * both PageSwapCache and page_private after getting page lock;
+ * but be defensive. Reverse old to newpage for clear and free.
+ */
+ oldpage = newpage;
+ } else {
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+ *pagep = newpage;
+ }
+
+ ClearPageSwapCache(oldpage);
+ set_page_private(oldpage, 0);
+
+ unlock_page(oldpage);
+ page_cache_release(oldpage);
+ page_cache_release(oldpage);
+ return error;
+}
+
+/*
+ * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp, int *type)
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
- struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo;
- struct page *filepage = *pagep;
- struct page *swappage;
- swp_entry_t *entry;
+ struct page *page;
swp_entry_t swap;
int error;
+ int once = 0;
+ int alloced = 0;
- if (idx >= SHMEM_MAX_INDEX)
+ if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
return -EFBIG;
- /*
- * Normally, filepage is NULL on entry, and either found
- * uptodate immediately, or allocated and zeroed, or read
- * in under swappage, which is then assigned to filepage.
- * But shmem_prepare_write passes in a locked filepage,
- * which may be found not uptodate by other callers too,
- * and may need to be copied from the swappage read in.
- */
repeat:
- if (!filepage)
- filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
- error = 0;
- if (sgp == SGP_QUICK)
- goto failed;
+ swap.val = 0;
+ page = find_lock_entry(mapping, index);
+ if (radix_tree_exceptional_entry(page)) {
+ swap = radix_to_swp_entry(page);
+ page = NULL;
+ }
- spin_lock(&info->lock);
- shmem_recalc_inode(inode);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry)) {
- spin_unlock(&info->lock);
- error = PTR_ERR(entry);
+ if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
goto failed;
}
- swap = *entry;
+
+ if (page && sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ /* fallocated page? */
+ if (page && !PageUptodate(page)) {
+ if (sgp != SGP_READ)
+ goto clear;
+ unlock_page(page);
+ page_cache_release(page);
+ page = NULL;
+ }
+ if (page || (sgp == SGP_READ && !swap.val)) {
+ *pagep = page;
+ return 0;
+ }
+
+ /*
+ * Fast cache lookup did not find it:
+ * bring it back from swap or allocate.
+ */
+ info = SHMEM_I(inode);
+ sbinfo = SHMEM_SB(inode->i_sb);
if (swap.val) {
/* Look it up and read it in.. */
- swappage = lookup_swap_cache(swap);
- if (!swappage) {
- shmem_swp_unmap(entry);
+ page = lookup_swap_cache(swap);
+ if (!page) {
/* here we actually do the io */
- if (type && *type == VM_FAULT_MINOR) {
- __count_vm_event(PGMAJFAULT);
- *type = VM_FAULT_MAJOR;
- }
- spin_unlock(&info->lock);
- swappage = shmem_swapin(info, swap, idx);
- if (!swappage) {
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- if (entry->val == swap.val)
- error = -ENOMEM;
- shmem_swp_unmap(entry);
- }
- spin_unlock(&info->lock);
- if (error)
- goto failed;
- goto repeat;
+ if (fault_type)
+ *fault_type |= VM_FAULT_MAJOR;
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
}
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
- goto repeat;
}
/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_locked(swappage);
- page_cache_release(swappage);
- goto repeat;
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST; /* try again */
+ goto unlock;
}
- if (PageWriteback(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- wait_on_page_writeback(swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- goto repeat;
- }
- if (!PageUptodate(swappage)) {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
+ if (!PageUptodate(page)) {
error = -EIO;
goto failed;
}
+ wait_on_page_writeback(page);
- if (filepage) {
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- delete_from_swap_cache(swappage);
- spin_unlock(&info->lock);
- copy_highpage(filepage, swappage);
- unlock_page(swappage);
- page_cache_release(swappage);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
- set_page_dirty(filepage);
- swap_free(swap);
- } else if (!(error = move_from_swap_cache(
- swappage, idx, mapping))) {
- info->flags |= SHMEM_PAGEIN;
- shmem_swp_set(info, entry, 0);
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- filepage = swappage;
- swap_free(swap);
- } else {
- shmem_swp_unmap(entry);
- spin_unlock(&info->lock);
- unlock_page(swappage);
- page_cache_release(swappage);
- if (error == -ENOMEM) {
- /* let kswapd refresh zone for GFP_ATOMICs */
- blk_congestion_wait(WRITE, HZ/50);
- }
- goto repeat;
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
}
- } else if (sgp == SGP_READ && !filepage) {
- shmem_swp_unmap(entry);
- filepage = find_get_page(mapping, idx);
- if (filepage &&
- (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
- spin_unlock(&info->lock);
- wait_on_page_locked(filepage);
- page_cache_release(filepage);
- filepage = NULL;
- goto repeat;
+
+ error = mem_cgroup_charge_file(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, swp_to_radix_entry(swap));
+ /*
+ * We already confirmed swap under page lock, and make
+ * no memory allocation here, so usually no possibility
+ * of error; but free_swap_and_cache() only trylocks a
+ * page, so it is just possible that the entry has been
+ * truncated or holepunched since swap was confirmed.
+ * shmem_undo_range() will have done some of the
+ * unaccounting, now delete_from_swap_cache() will do
+ * the rest (including mem_cgroup_uncharge_swapcache).
+ * Reset swap.val? No, leave it so "failed" goes back to
+ * "repeat": reading a hole and writing should succeed.
+ */
+ if (error)
+ delete_from_swap_cache(page);
}
+ if (error)
+ goto failed;
+
+ spin_lock(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
} else {
- shmem_swp_unmap(entry);
- sbinfo = SHMEM_SB(inode->i_sb);
+ if (shmem_acct_block(info->flags)) {
+ error = -ENOSPC;
+ goto failed;
+ }
if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
- shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
- spin_unlock(&info->lock);
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0) {
error = -ENOSPC;
- goto failed;
+ goto unacct;
}
- sbinfo->free_blocks--;
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
- } else if (shmem_acct_block(info->flags)) {
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
+ percpu_counter_inc(&sbinfo->used_blocks);
}
- if (!filepage) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
- info,
- idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
+ page = shmem_alloc_page(gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto decused;
+ }
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- if (error || swap.val || 0 != add_to_page_cache_lru(
- filepage, mapping, idx, GFP_ATOMIC)) {
- spin_unlock(&info->lock);
- page_cache_release(filepage);
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- filepage = NULL;
- if (error)
- goto failed;
- goto repeat;
- }
- info->flags |= SHMEM_PAGEIN;
+ __SetPageSwapBacked(page);
+ __set_page_locked(page);
+ if (sgp == SGP_WRITE)
+ init_page_accessed(page);
+
+ error = mem_cgroup_charge_file(page, current->mm,
+ gfp & GFP_RECLAIM_MASK);
+ if (error)
+ goto decused;
+ error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ radix_tree_preload_end();
+ }
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto decused;
}
+ lru_cache_add_anon(page);
+ spin_lock(&info->lock);
info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
spin_unlock(&info->lock);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ alloced = true;
+
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
+clear:
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+ if (sgp == SGP_DIRTY)
+ set_page_dirty(page);
}
-done:
- if (*pagep != filepage) {
- unlock_page(filepage);
- *pagep = filepage;
+
+ /* Perhaps the file has been truncated since we checked */
+ if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+ ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ error = -EINVAL;
+ if (alloced)
+ goto trunc;
+ else
+ goto failed;
}
+ *pagep = page;
return 0;
+ /*
+ * Error recovery.
+ */
+trunc:
+ info = SHMEM_I(inode);
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+ spin_lock(&info->lock);
+ info->alloced--;
+ inode->i_blocks -= BLOCKS_PER_PAGE;
+ spin_unlock(&info->lock);
+decused:
+ sbinfo = SHMEM_SB(inode->i_sb);
+ if (sbinfo->max_blocks)
+ percpu_counter_add(&sbinfo->used_blocks, -1);
+unacct:
+ shmem_unacct_blocks(info->flags, 1);
failed:
- if (*pagep != filepage) {
- unlock_page(filepage);
- page_cache_release(filepage);
+ if (swap.val && error != -EINVAL &&
+ !shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
}
+ if (error == -ENOSPC && !once++) {
+ info = SHMEM_I(inode);
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+ goto repeat;
+ }
+ if (error == -EEXIST) /* from above or from radix_tree_insert */
+ goto repeat;
return error;
}
-struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct page *page = NULL;
- unsigned long idx;
+ struct inode *inode = file_inode(vma->vm_file);
int error;
+ int ret = VM_FAULT_LOCKED;
- idx = (address - vma->vm_start) >> PAGE_SHIFT;
- idx += vma->vm_pgoff;
- idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return NOPAGE_SIGBUS;
-
- error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
- if (error)
- return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
-
- mark_page_accessed(page);
- return page;
-}
-
-static int shmem_populate(struct vm_area_struct *vma,
- unsigned long addr, unsigned long len,
- pgprot_t prot, unsigned long pgoff, int nonblock)
-{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct mm_struct *mm = vma->vm_mm;
- enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
- unsigned long size;
+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_mutex in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT(shmem_fault_wait);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+ !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* It's polite to up mmap_sem if we can */
+ up_read(&vma->vm_mm->mmap_sem);
+ ret = VM_FAULT_RETRY;
+ }
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
- return -EINVAL;
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
- while ((long) len > 0) {
- struct page *page = NULL;
- int err;
- /*
- * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
- */
- err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
- if (err)
- return err;
- /* Page may still be null, but only if nonblock was set. */
- if (page) {
- mark_page_accessed(page);
- err = install_page(mm, vma, addr, page, prot);
- if (err) {
- page_cache_release(page);
- return err;
- }
- } else if (vma->vm_flags & VM_NONLINEAR) {
- /* No page was found just because we can't read it in
- * now (being here implies nonblock != 0), but the page
- * may exist, so set the PTE to fault it in later. */
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ spin_unlock(&inode->i_lock);
+ return ret;
}
+ spin_unlock(&inode->i_lock);
+ }
- len -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+ if (error)
+ return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+
+ if (ret & VM_FAULT_MAJOR) {
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
}
- return 0;
+ return ret;
}
#ifdef CONFIG_NUMA
-int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
- return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+ struct inode *inode = file_inode(vma->vm_file);
+ return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}
-struct mempolicy *
-shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct inode *i = vma->vm_file->f_dentry->d_inode;
- unsigned long idx;
+ struct inode *inode = file_inode(vma->vm_file);
+ pgoff_t index;
- idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+ index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif
int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
@@ -1311,67 +1348,67 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
info->flags |= VM_LOCKED;
+ mapping_set_unevictable(file->f_mapping);
}
if (!lock && (info->flags & VM_LOCKED) && user) {
user_shm_unlock(inode->i_size, user);
info->flags &= ~VM_LOCKED;
+ mapping_clear_unevictable(file->f_mapping);
}
retval = 0;
+
out_nomem:
spin_unlock(&info->lock);
return retval;
}
-int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
return 0;
}
-static struct inode *
-shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
+static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+ umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return NULL;
- }
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
- }
+ if (shmem_reserve_inode(sb))
+ return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current->fsuid;
- inode->i_gid = current->fsgid;
+ inode->i_ino = get_next_ino();
+ inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
- inode->i_mapping->a_ops = &shmem_aops;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_generation = get_seconds();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
+ simple_xattrs_init(&info->xattrs);
+ cache_no_acl(inode);
switch (mode & S_IFMT) {
default:
+ inode->i_op = &shmem_special_inode_operations;
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
+ inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
- mpol_shared_policy_init(&info->policy, sbinfo->policy,
- &sbinfo->policy_nodes);
+ mpol_shared_policy_init(&info->policy,
+ shmem_get_sbmpol(sbinfo));
break;
case S_IFDIR:
- inode->i_nlink++;
+ inc_nlink(inode);
/* Some things misbehave if size == 0 on a directory */
inode->i_size = 2 * BOGO_DIRENT_SIZE;
inode->i_op = &shmem_dir_inode_operations;
@@ -1382,148 +1419,91 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
* Must not load anything in the rbtree,
* mpol_free_shared_policy will not be called.
*/
- mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
- NULL);
+ mpol_shared_policy_init(&info->policy, NULL);
break;
}
- } else if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
+ } else
+ shmem_free_inode(sb);
return inode;
}
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->backing_dev_info == &shmem_backing_dev_info;
+}
+
#ifdef CONFIG_TMPFS
-static struct inode_operations shmem_symlink_inode_operations;
-static struct inode_operations shmem_symlink_inline_operations;
+static const struct inode_operations shmem_symlink_inode_operations;
+static const struct inode_operations shmem_short_symlink_operations;
+
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#else
+#define shmem_initxattrs NULL
+#endif
-/*
- * Normally tmpfs makes no use of shmem_prepare_write, but it
- * lets a tmpfs file be used read-write below the loop driver.
- */
static int
-shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+shmem_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+ struct inode *inode = mapping->host;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
}
-static ssize_t
-shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static int
+shmem_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- struct inode *inode = file->f_dentry->d_inode;
- loff_t pos;
- unsigned long written;
- ssize_t err;
-
- if ((ssize_t) count < 0)
- return -EINVAL;
-
- if (!access_ok(VERIFY_READ, buf, count))
- return -EFAULT;
-
- mutex_lock(&inode->i_mutex);
-
- pos = *ppos;
- written = 0;
-
- err = generic_write_checks(file, &pos, &count, 0);
- if (err || !count)
- goto out;
-
- err = remove_suid(file->f_dentry);
- if (err)
- goto out;
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-
- do {
- struct page *page = NULL;
- unsigned long bytes, index, offset;
- char *kaddr;
- int left;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- /*
- * We don't hold page lock across copy from user -
- * what would it guard against? - so no deadlock here.
- * But it still may be a good idea to prefault below.
- */
-
- err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
- if (err)
- break;
+ struct inode *inode = mapping->host;
- left = bytes;
- if (PageHighMem(page)) {
- volatile unsigned char dummy;
- __get_user(dummy, buf);
- __get_user(dummy, buf + bytes - 1);
+ if (pos + copied > inode->i_size)
+ i_size_write(inode, pos + copied);
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset,
- buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if (left) {
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
+ if (!PageUptodate(page)) {
+ if (copied < PAGE_CACHE_SIZE) {
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ zero_user_segments(page, 0, from,
+ from + copied, PAGE_CACHE_SIZE);
}
+ SetPageUptodate(page);
+ }
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
- written += bytes;
- count -= bytes;
- pos += bytes;
- buf += bytes;
- if (pos > inode->i_size)
- i_size_write(inode, pos);
-
- flush_dcache_page(page);
- set_page_dirty(page);
- mark_page_accessed(page);
- page_cache_release(page);
-
- if (left) {
- pos -= left;
- written -= left;
- err = -EFAULT;
- break;
- }
-
- /*
- * Our dirty pages are not counted in nr_dirty,
- * and we do not attempt to balance dirty pages.
- */
-
- cond_resched();
- } while (count);
-
- *ppos = pos;
- if (written)
- err = written;
-out:
- mutex_unlock(&inode->i_mutex);
- return err;
+ return copied;
}
-static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct inode *inode = filp->f_dentry->d_inode;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- unsigned long index, offset;
+ pgoff_t index;
+ unsigned long offset;
+ enum sgp_type sgp = SGP_READ;
+ int error = 0;
+ ssize_t retval = 0;
+ loff_t *ppos = &iocb->ki_pos;
+
+ /*
+ * Might this read be for a stacking filesystem? Then when reading
+ * holes of a sparse file, we actually need to allocate those pages,
+ * and even mark them dirty, so it cannot exceed the max_blocks limit.
+ */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ sgp = SGP_DIRTY;
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
for (;;) {
struct page *page = NULL;
- unsigned long end_index, nr, ret;
+ pgoff_t end_index;
+ unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1535,12 +1515,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
break;
}
- desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
- if (desc->error) {
- if (desc->error == -EINVAL)
- desc->error = 0;
+ error = shmem_getpage(inode, index, &page, sgp, NULL);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
break;
}
+ if (page)
+ unlock_page(page);
/*
* We must evaluate after, since reads (unlike writes)
@@ -1580,68 +1562,346 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
+ ret = copy_page_to_iter(page, offset, nr, to);
+ retval += ret;
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
- if (ret != nr || !desc->count)
+ if (!iov_iter_count(to))
break;
-
+ if (ret < nr) {
+ error = -EFAULT;
+ break;
+ }
cond_resched();
}
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- file_accessed(filp);
+ file_accessed(file);
+ return retval ? retval : error;
}
-static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
- read_descriptor_t desc;
-
- if ((ssize_t) count < 0)
- return -EINVAL;
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
- if (!count)
+ struct address_space *mapping = in->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned int loff, nr_pages, req_pages;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct page *page;
+ pgoff_t index, end_index;
+ loff_t isize, left;
+ int error, page_nr;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
+ .flags = flags,
+ .ops = &page_cache_pipe_buf_ops,
+ .spd_release = spd_release_page,
+ };
+
+ isize = i_size_read(inode);
+ if (unlikely(*ppos >= isize))
return 0;
- desc.written = 0;
- desc.count = count;
- desc.arg.buf = buf;
- desc.error = 0;
+ left = isize - *ppos;
+ if (unlikely(left < len))
+ len = left;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ loff = *ppos & ~PAGE_CACHE_MASK;
+ req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ nr_pages = min(req_pages, spd.nr_pages_max);
+
+ spd.nr_pages = find_get_pages_contig(mapping, index,
+ nr_pages, spd.pages);
+ index += spd.nr_pages;
+ error = 0;
+
+ while (spd.nr_pages < nr_pages) {
+ error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ spd.pages[spd.nr_pages++] = page;
+ index++;
+ }
+
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ nr_pages = spd.nr_pages;
+ spd.nr_pages = 0;
+
+ for (page_nr = 0; page_nr < nr_pages; page_nr++) {
+ unsigned int this_len;
+
+ if (!len)
+ break;
+
+ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ page = spd.pages[page_nr];
- do_shmem_file_read(filp, ppos, &desc, file_read_actor);
- if (desc.written)
- return desc.written;
- return desc.error;
+ if (!PageUptodate(page) || page->mapping != mapping) {
+ error = shmem_getpage(inode, index, &page,
+ SGP_CACHE, NULL);
+ if (error)
+ break;
+ unlock_page(page);
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
+ }
+
+ isize = i_size_read(inode);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index))
+ break;
+
+ if (end_index == index) {
+ unsigned int plen;
+
+ plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ if (plen <= loff)
+ break;
+
+ this_len = min(this_len, plen - loff);
+ len = this_len;
+ }
+
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
+ len -= this_len;
+ loff = 0;
+ spd.nr_pages++;
+ index++;
+ }
+
+ while (page_nr < nr_pages)
+ page_cache_release(spd.pages[page_nr++]);
+
+ if (spd.nr_pages)
+ error = splice_to_pipe(pipe, &spd);
+
+ splice_shrink_spd(&spd);
+
+ if (error > 0) {
+ *ppos += error;
+ file_accessed(in);
+ }
+ return error;
}
-static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pgoff_t index, pgoff_t end, int whence)
{
- read_descriptor_t desc;
+ struct page *page;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool done = false;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+ pvec.nr = find_get_entries(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (whence == SEEK_DATA)
+ index = end;
+ break;
+ }
+ for (i = 0; i < pvec.nr; i++, index++) {
+ if (index < indices[i]) {
+ if (whence == SEEK_HOLE) {
+ done = true;
+ break;
+ }
+ index = indices[i];
+ }
+ page = pvec.pages[i];
+ if (page && !radix_tree_exceptional_entry(page)) {
+ if (!PageUptodate(page))
+ page = NULL;
+ }
+ if (index >= end ||
+ (page && whence == SEEK_DATA) ||
+ (!page && whence == SEEK_HOLE)) {
+ done = true;
+ break;
+ }
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+ }
+ return index;
+}
- if (!count)
- return 0;
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t start, end;
+ loff_t new_offset;
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
+ if (whence != SEEK_DATA && whence != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, whence,
+ MAX_LFS_FILESIZE, i_size_read(inode));
+ mutex_lock(&inode->i_mutex);
+ /* We're holding i_mutex so we can access i_size directly */
+
+ if (offset < 0)
+ offset = -EINVAL;
+ else if (offset >= inode->i_size)
+ offset = -ENXIO;
+ else {
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ new_offset = shmem_seek_hole_data(mapping, start, end, whence);
+ new_offset <<= PAGE_CACHE_SHIFT;
+ if (new_offset > offset) {
+ if (new_offset < inode->i_size)
+ offset = new_offset;
+ else if (whence == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ offset = inode->i_size;
+ }
+ }
- do_shmem_file_read(in_file, ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
+ if (offset >= 0)
+ offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
+static long shmem_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct shmem_falloc shmem_falloc;
+ pgoff_t start, index, end;
+ int error;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ struct address_space *mapping = file->f_mapping;
+ loff_t unmap_start = round_up(offset, PAGE_SIZE);
+ loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+ shmem_falloc.waitq = &shmem_falloc_waitq;
+ shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
+ if ((u64)unmap_end > (u64)unmap_start)
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ shmem_truncate_range(inode, offset, offset + len - 1);
+ /* No need to unmap again: hole-punching leaves COWed pages */
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&shmem_falloc_waitq);
+ spin_unlock(&inode->i_lock);
+ error = 0;
+ goto out;
+ }
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ /* Try to avoid a swapstorm if len is impossible to satisfy */
+ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ shmem_falloc.waitq = NULL;
+ shmem_falloc.start = start;
+ shmem_falloc.next = start;
+ shmem_falloc.nr_falloced = 0;
+ shmem_falloc.nr_unswapped = 0;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
+
+ for (index = start; index < end; index++) {
+ struct page *page;
+
+ /*
+ * Good, the fallocate(2) manpage permits EINTR: we may have
+ * been interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current))
+ error = -EINTR;
+ else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
+ error = -ENOMEM;
+ else
+ error = shmem_getpage(inode, index, &page, SGP_FALLOC,
+ NULL);
+ if (error) {
+ /* Remove the !PageUptodate pages we added */
+ shmem_undo_range(inode,
+ (loff_t)start << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_CACHE_SHIFT, true);
+ goto undone;
+ }
+
+ /*
+ * Inform shmem_writepage() how far we have reached.
+ * No need for lock or barrier: we have the page lock.
+ */
+ shmem_falloc.next++;
+ if (!PageUptodate(page))
+ shmem_falloc.nr_falloced++;
+
+ /*
+ * If !PageUptodate, leave it that way so that freeable pages
+ * can be recognized if we need to rollback on error later.
+ * But set_page_dirty so that memory pressure will swap rather
+ * than free the pages we are allocating (and SGP_CACHE pages
+ * might still be clean: we now need to mark those dirty too).
+ */
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+ cond_resched();
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+undone:
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
}
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1651,17 +1911,17 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
- buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ buf->f_bavail =
+ buf->f_bfree = sbinfo->max_blocks -
+ percpu_counter_sum(&sbinfo->used_blocks);
}
if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
- spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1669,46 +1929,70 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
- struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
+ struct inode *inode;
int error = -ENOSPC;
+ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- error = 0;
- }
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+
+ error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
return error;
+out_iput:
+ iput(inode);
+ return error;
+}
+
+static int
+shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+ int error = -ENOSPC;
+
+ inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+ if (inode) {
+ error = security_inode_init_security(inode, dir,
+ NULL,
+ shmem_initxattrs, NULL);
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
+ d_tmpfile(dentry, inode);
+ }
+ return error;
+out_iput:
+ iput(inode);
+ return error;
}
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int error;
if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
return error;
- dir->i_nlink++;
+ inc_nlink(dir);
return 0;
}
-static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
+static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}
@@ -1719,48 +2003,37 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ int ret;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
- }
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
- }
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- inode->i_nlink++;
- atomic_inc(&inode->i_count); /* New dentry reference */
+ inc_nlink(inode);
+ ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
- return 0;
+out:
+ return ret;
}
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
- }
- }
+ if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+ shmem_free_inode(inode->i_sb);
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- inode->i_nlink--;
+ drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
}
@@ -1770,8 +2043,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
if (!simple_empty(dentry))
return -ENOTEMPTY;
- dentry->d_inode->i_nlink--;
- dir->i_nlink--;
+ drop_nlink(dentry->d_inode);
+ drop_nlink(dir);
return shmem_unlink(dir, dentry);
}
@@ -1792,10 +2065,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs)
- old_dir->i_nlink--;
+ drop_nlink(old_dir);
} else if (they_are_dirs) {
- old_dir->i_nlink--;
- new_dir->i_nlink++;
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
}
old_dir->i_size -= BOGO_DIRENT_SIZE;
@@ -1811,7 +2084,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
int error;
int len;
struct inode *inode;
- struct page *page = NULL;
+ struct page *page;
char *kaddr;
struct shmem_inode_info *info;
@@ -1819,12 +2092,12 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (len > PAGE_CACHE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
+ shmem_initxattrs, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1835,25 +2108,29 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
info = SHMEM_I(inode);
inode->i_size = len-1;
- if (len <= (char *)inode - (char *)info) {
- /* do it inline */
- memcpy(info, symname, len);
- inode->i_op = &shmem_symlink_inline_operations;
+ if (len <= SHORT_SYMLINK_LEN) {
+ info->symlink = kmemdup(symname, len, GFP_KERNEL);
+ if (!info->symlink) {
+ iput(inode);
+ return -ENOMEM;
+ }
+ inode->i_op = &shmem_short_symlink_operations;
} else {
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
iput(inode);
return error;
}
+ inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, symname, len);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
set_page_dirty(page);
+ unlock_page(page);
page_cache_release(page);
}
- if (dir->i_mode & S_ISGID)
- inode->i_gid = dir->i_gid;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
d_instantiate(dentry, inode);
@@ -1861,17 +2138,19 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return 0;
}
-static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
+ nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
return NULL;
}
static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct page *page = NULL;
- int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
- nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
+ int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+ nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
+ if (page)
+ unlock_page(page);
return page;
}
@@ -1885,23 +2164,248 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
}
}
-static struct inode_operations shmem_symlink_inline_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+/*
+ * Superblocks without xattr inode operations may get some security.* xattr
+ * support from the LSM "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+/*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+static int shmem_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ const struct xattr *xattr;
+ struct simple_xattr *new_xattr;
+ size_t len;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
+ if (!new_xattr)
+ return -ENOMEM;
+
+ len = strlen(xattr->name) + 1;
+ new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+ GFP_KERNEL);
+ if (!new_xattr->name) {
+ kfree(new_xattr);
+ return -ENOMEM;
+ }
+
+ memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
+ memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+ xattr->name, len);
+
+ simple_xattr_list_add(&info->xattrs, new_xattr);
+ }
+
+ return 0;
+}
+
+static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL
+};
+
+static int shmem_xattr_validate(const char *name)
+{
+ struct { const char *prefix; size_t len; } arr[] = {
+ { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
+ { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(arr); i++) {
+ size_t preflen = arr[i].len;
+ if (strncmp(name, arr[i].prefix, preflen) == 0) {
+ if (!name[preflen])
+ return -EINVAL;
+ return 0;
+ }
+ }
+ return -EOPNOTSUPP;
+}
+
+static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, buffer, size);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return simple_xattr_get(&info->xattrs, name, buffer, size);
+}
+
+static int shmem_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return simple_xattr_set(&info->xattrs, name, value, size, flags);
+}
+
+static int shmem_removexattr(struct dentry *dentry, const char *name)
+{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ int err;
+
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ err = shmem_xattr_validate(name);
+ if (err)
+ return err;
+
+ return simple_xattr_remove(&info->xattrs, name);
+}
+
+static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ return simple_xattr_list(&info->xattrs, buffer, size);
+}
+#endif /* CONFIG_TMPFS_XATTR */
+
+static const struct inode_operations shmem_short_symlink_operations = {
.readlink = generic_readlink,
- .follow_link = shmem_follow_link_inline,
+ .follow_link = shmem_follow_short_symlink,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
};
-static struct inode_operations shmem_symlink_inode_operations = {
- .truncate = shmem_truncate,
+static const struct inode_operations shmem_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = shmem_follow_link,
.put_link = shmem_put_link,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
};
-static int shmem_parse_options(char *options, int *mode, uid_t *uid,
- gid_t *gid, unsigned long *blocks, unsigned long *inodes,
- int *policy, nodemask_t *policy_nodes)
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+ return ERR_PTR(-ESTALE);
+}
+
+static int shmem_match(struct inode *ino, void *vfh)
+{
+ __u32 *fh = vfh;
+ __u64 inum = fh[2];
+ inum = (inum << 32) | fh[1];
+ return ino->i_ino == inum && fh[0] == ino->i_generation;
+}
+
+static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct inode *inode;
+ struct dentry *dentry = NULL;
+ u64 inum;
+
+ if (fh_len < 3)
+ return NULL;
+
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
+ inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
+ shmem_match, fid->raw);
+ if (inode) {
+ dentry = d_find_alias(inode);
+ iput(inode);
+ }
+
+ return dentry;
+}
+
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+ struct inode *parent)
+{
+ if (*len < 3) {
+ *len = 3;
+ return FILEID_INVALID;
+ }
+
+ if (inode_unhashed(inode)) {
+ /* Unfortunately insert_inode_hash is not idempotent,
+ * so as we hash inodes here rather than at creation
+ * time, we need a lock to ensure we only try
+ * to do it once
+ */
+ static DEFINE_SPINLOCK(lock);
+ spin_lock(&lock);
+ if (inode_unhashed(inode))
+ __insert_inode_hash(inode,
+ inode->i_ino + inode->i_generation);
+ spin_unlock(&lock);
+ }
+
+ fh[0] = inode->i_generation;
+ fh[1] = inode->i_ino;
+ fh[2] = ((__u64)inode->i_ino) >> 32;
+
+ *len = 3;
+ return 1;
+}
+
+static const struct export_operations shmem_export_ops = {
+ .get_parent = shmem_get_parent,
+ .encode_fh = shmem_encode_fh,
+ .fh_to_dentry = shmem_fh_to_dentry,
+};
+
+static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
+ bool remount)
{
char *this_char, *value, *rest;
+ struct mempolicy *mpol = NULL;
+ uid_t uid;
+ gid_t gid;
while (options != NULL) {
this_char = options;
@@ -1928,7 +2432,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
printk(KERN_ERR
"tmpfs: No value for mount option '%s'\n",
this_char);
- return 1;
+ goto error;
}
if (!strcmp(this_char,"size")) {
@@ -1942,47 +2446,59 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
}
if (*rest)
goto bad_val;
- *blocks = size >> PAGE_CACHE_SHIFT;
+ sbinfo->max_blocks =
+ DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
} else if (!strcmp(this_char,"nr_blocks")) {
- *blocks = memparse(value,&rest);
+ sbinfo->max_blocks = memparse(value, &rest);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"nr_inodes")) {
- *inodes = memparse(value,&rest);
+ sbinfo->max_inodes = memparse(value, &rest);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mode")) {
- if (!mode)
+ if (remount)
continue;
- *mode = simple_strtoul(value,&rest,8);
+ sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"uid")) {
- if (!uid)
+ if (remount)
continue;
- *uid = simple_strtoul(value,&rest,0);
+ uid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
+ sbinfo->uid = make_kuid(current_user_ns(), uid);
+ if (!uid_valid(sbinfo->uid))
+ goto bad_val;
} else if (!strcmp(this_char,"gid")) {
- if (!gid)
+ if (remount)
continue;
- *gid = simple_strtoul(value,&rest,0);
+ gid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
+ sbinfo->gid = make_kgid(current_user_ns(), gid);
+ if (!gid_valid(sbinfo->gid))
+ goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (shmem_parse_mpol(value,policy,policy_nodes))
+ mpol_put(mpol);
+ mpol = NULL;
+ if (mpol_parse_str(value, &mpol))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
- return 1;
+ goto error;
}
}
+ sbinfo->mpol = mpol;
return 0;
bad_val:
printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
+error:
+ mpol_put(mpol);
return 1;
}
@@ -1990,69 +2506,95 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = sbinfo->max_blocks;
- unsigned long max_inodes = sbinfo->max_inodes;
- int policy = sbinfo->policy;
- nodemask_t policy_nodes = sbinfo->policy_nodes;
- unsigned long blocks;
+ struct shmem_sb_info config = *sbinfo;
unsigned long inodes;
int error = -EINVAL;
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
- &max_inodes, &policy, &policy_nodes))
+ config.mpol = NULL;
+ if (shmem_parse_options(data, &config, true))
return error;
spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (max_blocks < blocks)
+ if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
goto out;
- if (max_inodes < inodes)
+ if (config.max_inodes < inodes)
goto out;
/*
- * Those tests also disallow limited->unlimited while any are in
- * use, so i_blocks will always be zero when max_blocks is zero;
+ * Those tests disallow limited->unlimited while any are in use;
* but we must separately disallow unlimited->limited, because
* in that case we have no record of how much is already in use.
*/
- if (max_blocks && !sbinfo->max_blocks)
+ if (config.max_blocks && !sbinfo->max_blocks)
goto out;
- if (max_inodes && !sbinfo->max_inodes)
+ if (config.max_inodes && !sbinfo->max_inodes)
goto out;
error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
- sbinfo->policy = policy;
- sbinfo->policy_nodes = policy_nodes;
+ sbinfo->max_blocks = config.max_blocks;
+ sbinfo->max_inodes = config.max_inodes;
+ sbinfo->free_inodes = config.max_inodes - inodes;
+
+ /*
+ * Preserve previous mempolicy unless mpol remount option was specified.
+ */
+ if (config.mpol) {
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = config.mpol; /* transfers initial ref */
+ }
out:
spin_unlock(&sbinfo->stat_lock);
return error;
}
-#endif
+
+static int shmem_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+
+ if (sbinfo->max_blocks != shmem_default_max_blocks())
+ seq_printf(seq, ",size=%luk",
+ sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+ if (sbinfo->max_inodes != shmem_default_max_inodes())
+ seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+ if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
+ seq_printf(seq, ",mode=%03ho", sbinfo->mode);
+ if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+ seq_printf(seq, ",uid=%u",
+ from_kuid_munged(&init_user_ns, sbinfo->uid));
+ if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u",
+ from_kgid_munged(&init_user_ns, sbinfo->gid));
+ shmem_show_mpol(seq, sbinfo->mpol);
+ return 0;
+}
+#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
{
- kfree(sb->s_fs_info);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ percpu_counter_destroy(&sbinfo->used_blocks);
+ mpol_put(sbinfo->mpol);
+ kfree(sbinfo);
sb->s_fs_info = NULL;
}
-static int shmem_fill_super(struct super_block *sb,
- void *data, int silent)
+int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- struct dentry *root;
- int mode = S_IRWXUGO | S_ISVTX;
- uid_t uid = current->fsuid;
- gid_t gid = current->fsgid;
- int err = -ENOMEM;
struct shmem_sb_info *sbinfo;
- unsigned long blocks = 0;
- unsigned long inodes = 0;
- int policy = MPOL_DEFAULT;
- nodemask_t policy_nodes = node_online_map;
+ int err = -ENOMEM;
+
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ sbinfo->mode = S_IRWXUGO | S_ISVTX;
+ sbinfo->uid = current_fsuid();
+ sbinfo->gid = current_fsgid();
+ sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
/*
@@ -2060,54 +2602,50 @@ static int shmem_fill_super(struct super_block *sb,
* tmpfs instance, limiting inodes to one per page of lowmem;
* but the internal instance is left unlimited.
*/
- if (!(sb->s_flags & MS_NOUSER)) {
- blocks = totalram_pages / 2;
- inodes = totalram_pages - totalhigh_pages;
- if (inodes > blocks)
- inodes = blocks;
- if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
- &inodes, &policy, &policy_nodes))
- return -EINVAL;
+ if (!(sb->s_flags & MS_KERNMOUNT)) {
+ sbinfo->max_blocks = shmem_default_max_blocks();
+ sbinfo->max_inodes = shmem_default_max_inodes();
+ if (shmem_parse_options(data, sbinfo, false)) {
+ err = -EINVAL;
+ goto failed;
+ }
+ } else {
+ sb->s_flags |= MS_NOUSER;
}
+ sb->s_export_op = &shmem_export_ops;
+ sb->s_flags |= MS_NOSEC;
#else
sb->s_flags |= MS_NOUSER;
#endif
- /* Round up to L1_CACHE_BYTES to resist false sharing */
- sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
- L1_CACHE_BYTES), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
-
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- sbinfo->policy = policy;
- sbinfo->policy_nodes = policy_nodes;
+ if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ goto failed;
+ sbinfo->free_inodes = sbinfo->max_inodes;
- sb->s_fs_info = sbinfo;
- sb->s_maxbytes = SHMEM_MAX_BYTES;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_XATTR
+ sb->s_xattr = shmem_xattr_handlers;
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
- inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
+ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
if (!inode)
goto failed;
- inode->i_uid = uid;
- inode->i_gid = gid;
- root = d_alloc_root(inode);
- if (!root)
- goto failed_iput;
- sb->s_root = root;
+ inode->i_uid = sbinfo->uid;
+ inode->i_gid = sbinfo->gid;
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ goto failed;
return 0;
-failed_iput:
- iput(inode);
failed:
shmem_put_super(sb);
return err;
@@ -2117,76 +2655,83 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
- struct shmem_inode_info *p;
- p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
- if (!p)
+ struct shmem_inode_info *info;
+ info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ if (!info)
return NULL;
- return &p->vfs_inode;
+ return &info->vfs_inode;
}
-static void shmem_destroy_inode(struct inode *inode)
+static void shmem_destroy_callback(struct rcu_head *head)
{
- if ((inode->i_mode & S_IFMT) == S_IFREG) {
- /* only struct inode is valid if it's an inline symlink */
- mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- }
+ struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
-static void init_once(void *foo, struct kmem_cache *cachep,
- unsigned long flags)
+static void shmem_destroy_inode(struct inode *inode)
{
- struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
+ if (S_ISREG(inode->i_mode))
+ mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ call_rcu(&inode->i_rcu, shmem_destroy_callback);
+}
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- inode_init_once(&p->vfs_inode);
- }
+static void shmem_init_inode(void *foo)
+{
+ struct shmem_inode_info *info = foo;
+ inode_init_once(&info->vfs_inode);
}
-static int init_inodecache(void)
+static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, 0, init_once, NULL);
- if (shmem_inode_cachep == NULL)
- return -ENOMEM;
+ 0, SLAB_PANIC, shmem_init_inode);
return 0;
}
-static void destroy_inodecache(void)
+static void shmem_destroy_inodecache(void)
{
kmem_cache_destroy(shmem_inode_cachep);
}
static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .prepare_write = shmem_prepare_write,
- .commit_write = simple_commit_write,
+ .write_begin = shmem_write_begin,
+ .write_end = shmem_write_end,
#endif
.migratepage = migrate_page,
+ .error_remove_page = generic_error_remove_page,
};
-static struct file_operations shmem_file_operations = {
+static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = generic_file_llseek,
- .read = shmem_file_read,
- .write = shmem_file_write,
- .fsync = simple_sync_file,
- .sendfile = shmem_file_sendfile,
+ .llseek = shmem_file_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = shmem_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .fsync = noop_fsync,
+ .splice_read = shmem_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = shmem_fallocate,
#endif
};
-static struct inode_operations shmem_inode_operations = {
- .truncate = shmem_truncate,
- .setattr = shmem_notify_change,
- .truncate_range = shmem_truncate_range,
+static const struct inode_operations shmem_inode_operations = {
+ .setattr = shmem_setattr,
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+ .set_acl = simple_set_acl,
+#endif
};
-static struct inode_operations shmem_dir_inode_operations = {
+static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
.create = shmem_create,
.lookup = simple_lookup,
@@ -2197,61 +2742,93 @@ static struct inode_operations shmem_dir_inode_operations = {
.rmdir = shmem_rmdir,
.mknod = shmem_mknod,
.rename = shmem_rename,
+ .tmpfile = shmem_tmpfile,
+#endif
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
+#endif
+};
+
+static const struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+ .setxattr = shmem_setxattr,
+ .getxattr = shmem_getxattr,
+ .listxattr = shmem_listxattr,
+ .removexattr = shmem_removexattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ .setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
-static struct super_operations shmem_ops = {
+static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
.remount_fs = shmem_remount_fs,
+ .show_options = shmem_show_options,
#endif
- .delete_inode = shmem_delete_inode,
+ .evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
};
-static struct vm_operations_struct shmem_vm_ops = {
- .nopage = shmem_nopage,
- .populate = shmem_populate,
+static const struct vm_operations_struct shmem_vm_ops = {
+ .fault = shmem_fault,
+ .map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+ .remap_pages = generic_file_remap_pages,
};
-
-static int shmem_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
-static struct file_system_type tmpfs_fs_type = {
+static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .get_sb = shmem_get_sb,
+ .mount = shmem_mount,
.kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
};
-static struct vfsmount *shm_mnt;
-static int __init init_tmpfs(void)
+int __init shmem_init(void)
{
int error;
- error = init_inodecache();
+ /* If rootfs called this, don't re-init */
+ if (shmem_inode_cachep)
+ return 0;
+
+ error = bdi_init(&shmem_backing_dev_info);
+ if (error)
+ goto out4;
+
+ error = shmem_init_inodecache();
if (error)
goto out3;
- error = register_filesystem(&tmpfs_fs_type);
+ error = register_filesystem(&shmem_fs_type);
if (error) {
printk(KERN_ERR "Could not register tmpfs\n");
goto out2;
}
- shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
- tmpfs_fs_type.name, NULL);
+ shm_mnt = kern_mount(&shmem_fs_type);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2260,81 +2837,163 @@ static int __init init_tmpfs(void)
return 0;
out1:
- unregister_filesystem(&tmpfs_fs_type);
+ unregister_filesystem(&shmem_fs_type);
out2:
- destroy_inodecache();
+ shmem_destroy_inodecache();
out3:
+ bdi_destroy(&shmem_backing_dev_info);
+out4:
shm_mnt = ERR_PTR(error);
return error;
}
-module_init(init_tmpfs)
+
+#else /* !CONFIG_SHMEM */
/*
- * shmem_file_setup - get an unlinked file living in tmpfs
- *
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
+ * tiny-shmem: simple shmemfs and tmpfs using ramfs code
*
+ * This is intended for small systems where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
*/
-struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
+
+static struct file_system_type shmem_fs_type = {
+ .name = "tmpfs",
+ .mount = ramfs_mount,
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+int __init shmem_init(void)
{
- int error;
- struct file *file;
+ BUG_ON(register_filesystem(&shmem_fs_type) != 0);
+
+ shm_mnt = kern_mount(&shmem_fs_type);
+ BUG_ON(IS_ERR(shm_mnt));
+
+ return 0;
+}
+
+int shmem_unuse(swp_entry_t swap, struct page *page)
+{
+ return 0;
+}
+
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+ return 0;
+}
+
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+}
+
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ truncate_inode_pages_range(inode->i_mapping, lstart, lend);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
+#define shmem_vm_ops generic_file_vm_ops
+#define shmem_file_operations ramfs_file_operations
+#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_acct_size(flags, size) 0
+#define shmem_unacct_size(flags, size) do {} while (0)
+
+#endif /* CONFIG_SHMEM */
+
+/* common code */
+
+static struct dentry_operations anon_ops = {
+ .d_dname = simple_dname
+};
+
+static struct file *__shmem_file_setup(const char *name, loff_t size,
+ unsigned long flags, unsigned int i_flags)
+{
+ struct file *res;
struct inode *inode;
- struct dentry *dentry, *root;
+ struct path path;
+ struct super_block *sb;
struct qstr this;
if (IS_ERR(shm_mnt))
- return (void *)shm_mnt;
+ return ERR_CAST(shm_mnt);
- if (size < 0 || size > SHMEM_MAX_BYTES)
+ if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
- error = -ENOMEM;
+ res = ERR_PTR(-ENOMEM);
this.name = name;
this.len = strlen(name);
this.hash = 0; /* will go */
- root = shm_mnt->mnt_root;
- dentry = d_alloc(root, &this);
- if (!dentry)
+ sb = shm_mnt->mnt_sb;
+ path.dentry = d_alloc_pseudo(sb, &this);
+ if (!path.dentry)
goto put_memory;
+ d_set_d_op(path.dentry, &anon_ops);
+ path.mnt = mntget(shm_mnt);
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
- error = -ENOSPC;
- inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
+ res = ERR_PTR(-ENOSPC);
+ inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
if (!inode)
- goto close_file;
+ goto put_dentry;
- SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
- d_instantiate(dentry, inode);
+ inode->i_flags |= i_flags;
+ d_instantiate(path.dentry, inode);
inode->i_size = size;
- inode->i_nlink = 0; /* It is unlinked */
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &shmem_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
- return file;
-
-close_file:
- put_filp(file);
+ clear_nlink(inode); /* It is unlinked */
+ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+ if (IS_ERR(res))
+ goto put_dentry;
+
+ res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
+ &shmem_file_operations);
+ if (IS_ERR(res))
+ goto put_dentry;
+
+ return res;
+
put_dentry:
- dput(dentry);
+ path_put(&path);
put_memory:
shmem_unacct_size(flags, size);
- return ERR_PTR(error);
+ return res;
}
-/*
+/**
+ * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
+ * kernel internal. There will be NO LSM permission checks against the
+ * underlying inode. So users of this interface must do LSM checks at a
+ * higher layer. The one user is the big_key implementation. LSM checks
+ * are provided at the key level rather than the inode level.
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, S_PRIVATE);
+}
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, 0);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+/**
* shmem_zero_setup - setup a shared anonymous mapping
- *
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
int shmem_zero_setup(struct vm_area_struct *vma)
@@ -2352,3 +3011,42 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_ops = &shmem_vm_ops;
return 0;
}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * suit tmpfs, since it may have pages in swapcache, and needs to find those
+ * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
+ *
+ * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
+ * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
+ */
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+#ifdef CONFIG_SHMEM
+ struct inode *inode = mapping->host;
+ struct page *page;
+ int error;
+
+ BUG_ON(mapping->a_ops != &shmem_aops);
+ error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+ if (error)
+ page = ERR_PTR(error);
+ else
+ unlock_page(page);
+ return page;
+#else
+ /*
+ * The tiny !SHMEM case uses ramfs without swap
+ */
+ return read_cache_page_gfp(mapping, index, gfp);
+#endif
+}
+EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
diff --git a/mm/slab.c b/mm/slab.c
index 792bfe320a8..3070b929a1b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -26,7 +26,7 @@
* initialized objects.
*
* This means, that your constructor is used only for newly allocated
- * slabs and you must pass objects with the same intializations to
+ * slabs and you must pass objects with the same initializations to
* kmem_cache_free.
*
* Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
@@ -68,7 +68,7 @@
* Further notes from the original documentation:
*
* 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
+ * The global cache-chain is protected by the mutex 'slab_mutex'.
* The sem is only needed when accessing/extending the cache-chain, which
* can never happen inside an interrupt (kmem_cache_create(),
* kmem_cache_shrink() and kmem_cache_reap()).
@@ -86,7 +86,6 @@
* All object allocations for a node occur from node specific slab lists.
*/
-#include <linux/config.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/poison.h>
@@ -96,6 +95,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
@@ -104,19 +104,33 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <linux/nodemask.h>
+#include <linux/kmemleak.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>
+#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
+#include <linux/reciprocal_div.h>
+#include <linux/debugobjects.h>
+#include <linux/kmemcheck.h>
+#include <linux/memory.h>
+#include <linux/prefetch.h>
+
+#include <net/sock.h>
-#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
+#include <trace/events/kmem.h>
+
+#include "internal.h"
+
+#include "slab.h"
+
/*
- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
- * SLAB_RED_ZONE & SLAB_POISON.
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
*
* STATS - 1 to collect stats for /proc/slabinfo.
@@ -137,115 +151,28 @@
/* Shouldn't this be in a header file somewhere? */
#define BYTES_PER_WORD sizeof(void *)
-
-#ifndef cache_line_size
-#define cache_line_size() L1_CACHE_BYTES
-#endif
-
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
- * Note that this flag disables some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN 0
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
+#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
-/* Legal flag mask for kmem_cache_create(). */
-#if DEBUG
-# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
- SLAB_POISON | SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | \
- SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+ <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+
+#if FREELIST_BYTE_INDEX
+typedef unsigned char freelist_idx_t;
#else
-# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
+typedef unsigned short freelist_idx_t;
#endif
-/*
- * kmem_bufctl_t:
- *
- * Bufctl's are used for linking objs within a slab
- * linked offsets.
- *
- * This implementation relies on "struct page" for locating the cache &
- * slab an object belongs to.
- * This allows the bufctl structure to be small (one int), but limits
- * the number of objects a slab (not a cache) can contain when off-slab
- * bufctls are used. The limit is the size of the largest general cache
- * that does not use off-slab slabs.
- * For 32bit archs with 4 kB pages, is this 56.
- * This is not serious, as it is only for large objects, when it is unwise
- * to have too many per slab.
- * Note: This limit can be raised by introducing a general cache whose size
- * is less than 512 (PAGE_SIZE<<3), but greater than 256.
- */
-
-typedef unsigned int kmem_bufctl_t;
-#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
-#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
-#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
-#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
-
-/*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
- struct list_head list;
- unsigned long colouroff;
- void *s_mem; /* including colour offset */
- unsigned int inuse; /* num of objs active in slab */
- kmem_bufctl_t free;
- unsigned short nodeid;
-};
+#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
/*
- * struct slab_rcu
- *
- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
- * arrange for kmem_freepages to be called via RCU. This is useful if
- * we need to approach a kernel structure obliquely, from its address
- * obtained without the usual locking. We can lock the structure to
- * stabilize it and check it's still at the given address, only if we
- * can be sure that the memory has not been meanwhile reused for some
- * other kind of object (which our subsystem's lock might corrupt).
- *
- * rcu_read_lock before reading the address, then rcu_read_unlock after
- * taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
+ * true if a page was allocated from pfmemalloc reserves for network-based
+ * swap
*/
-struct slab_rcu {
- struct rcu_head head;
- struct kmem_cache *cachep;
- void *addr;
-};
+static bool pfmemalloc_active __read_mostly;
/*
* struct array_cache
@@ -265,14 +192,34 @@ struct array_cache {
unsigned int batchcount;
unsigned int touched;
spinlock_t lock;
- void *entry[0]; /*
+ void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- * [0] is for gcc 2.95. It should really be [].
+ *
+ * Entries should not be directly dereferenced, because
+ * entries belonging to slabs marked pfmemalloc have the
+ * SLAB_OBJ_PFMEMALLOC low bit set in the pointer value.
*/
};
+#define SLAB_OBJ_PFMEMALLOC 1
+static inline bool is_obj_pfmemalloc(void *objp)
+{
+ return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
+}
+
+static inline void set_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
+ return;
+}
+
+static inline void clear_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
+}
+
/*
* bootstrap: The caches do not work without cpuarrays anymore, but the
* cpuarrays are allocated from the generic caches...
@@ -284,68 +231,27 @@ struct arraycache_init {
};
/*
- * The slab lists for all objects.
- */
-struct kmem_list3 {
- struct list_head slabs_partial; /* partial list first, better asm code */
- struct list_head slabs_full;
- struct list_head slabs_free;
- unsigned long free_objects;
- unsigned int free_limit;
- unsigned int colour_next; /* Per-node cache coloring */
- spinlock_t list_lock;
- struct array_cache *shared; /* shared per node */
- struct array_cache **alien; /* on other nodes */
- unsigned long next_reap; /* updated without locking */
- int free_touched; /* updated without locking */
-};
-
-/*
* Need this for bootstrapping a per node allocator.
*/
-#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
+static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
#define CACHE_CACHE 0
-#define SIZE_AC 1
-#define SIZE_L3 (1 + MAX_NUMNODES)
+#define SIZE_AC MAX_NUMNODES
+#define SIZE_NODE (2 * MAX_NUMNODES)
static int drain_freelist(struct kmem_cache *cache,
- struct kmem_list3 *l3, int tofree);
+ struct kmem_cache_node *n, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
-static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
-
-/*
- * This function must be completely optimized away if a constant is passed to
- * it. Mostly the same as what is in linux/slab.h except it returns an index.
- */
-static __always_inline int index_of(const size_t size)
-{
- extern void __bad_size(void);
-
- if (__builtin_constant_p(size)) {
- int i = 0;
-
-#define CACHE(x) \
- if (size <=x) \
- return i; \
- else \
- i++;
-#include "linux/kmalloc_sizes.h"
-#undef CACHE
- __bad_size();
- } else
- __bad_size();
- return 0;
-}
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
+static void cache_reap(struct work_struct *unused);
static int slab_early_init = 1;
-#define INDEX_AC index_of(sizeof(struct arraycache_init))
-#define INDEX_L3 index_of(sizeof(struct kmem_list3))
+#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
+#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
-static void kmem_list3_init(struct kmem_list3 *parent)
+static void kmem_cache_node_init(struct kmem_cache_node *parent)
{
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
@@ -361,7 +267,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define MAKE_LIST(cachep, listp, slab, nodeid) \
do { \
INIT_LIST_HEAD(listp); \
- list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+ list_splice(&(cachep->node[nodeid]->slab), listp); \
} while (0)
#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
@@ -371,79 +277,6 @@ static void kmem_list3_init(struct kmem_list3 *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
-/*
- * struct kmem_cache
- *
- * manages a cache.
- */
-
-struct kmem_cache {
-/* 1) per-cpu data, touched during every alloc/free */
- struct array_cache *array[NR_CPUS];
-/* 2) Cache tunables. Protected by cache_chain_mutex */
- unsigned int batchcount;
- unsigned int limit;
- unsigned int shared;
-
- unsigned int buffer_size;
-/* 3) touched by every alloc & free from the backend */
- struct kmem_list3 *nodelists[MAX_NUMNODES];
-
- unsigned int flags; /* constant flags */
- unsigned int num; /* # of objs per slab */
-
-/* 4) cache_grow/shrink */
- /* order of pgs per slab (2^n) */
- unsigned int gfporder;
-
- /* force GFP flags, e.g. GFP_DMA */
- gfp_t gfpflags;
-
- size_t colour; /* cache colouring range */
- unsigned int colour_off; /* colour offset */
- struct kmem_cache *slabp_cache;
- unsigned int slab_size;
- unsigned int dflags; /* dynamic flags */
-
- /* constructor func */
- void (*ctor) (void *, struct kmem_cache *, unsigned long);
-
- /* de-constructor func */
- void (*dtor) (void *, struct kmem_cache *, unsigned long);
-
-/* 5) cache creation/removal */
- const char *name;
- struct list_head next;
-
-/* 6) statistics */
-#if STATS
- unsigned long num_active;
- unsigned long num_allocations;
- unsigned long high_mark;
- unsigned long grown;
- unsigned long reaped;
- unsigned long errors;
- unsigned long max_freeable;
- unsigned long node_allocs;
- unsigned long node_frees;
- unsigned long node_overflow;
- atomic_t allochit;
- atomic_t allocmiss;
- atomic_t freehit;
- atomic_t freemiss;
-#endif
-#if DEBUG
- /*
- * If debugging is enabled, then the allocator can add additional
- * fields and/or padding to every object. buffer_size contains the total
- * object size including these internal fields, the following two
- * variables contain the offset to the user object and its size.
- */
- int obj_offset;
- int obj_size;
-#endif
-};
-
#define CFLGS_OFF_SLAB (0x80000000UL)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
@@ -455,8 +288,8 @@ struct kmem_cache {
* OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
*/
-#define REAPTIMEOUT_CPUC (2*HZ)
-#define REAPTIMEOUT_LIST3 (4*HZ)
+#define REAPTIMEOUT_AC (2*HZ)
+#define REAPTIMEOUT_NODE (4*HZ)
#if STATS
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
@@ -487,7 +320,7 @@ struct kmem_cache {
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -511,8 +344,8 @@ struct kmem_cache {
* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
* redzone word.
* cachep->obj_offset: The real object.
- * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->size - 1* BYTES_PER_WORD: last caller address
* [BYTES_PER_WORD long]
*/
static int obj_offset(struct kmem_cache *cachep)
@@ -520,158 +353,116 @@ static int obj_offset(struct kmem_cache *cachep)
return cachep->obj_offset;
}
-static int obj_size(struct kmem_cache *cachep)
-{
- return cachep->obj_size;
-}
-
-static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
+static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
- return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
+ return (unsigned long long*) (objp + obj_offset(cachep) -
+ sizeof(unsigned long long));
}
-static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
+static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
if (cachep->flags & SLAB_STORE_USER)
- return (unsigned long *)(objp + cachep->buffer_size -
- 2 * BYTES_PER_WORD);
- return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
+ return (unsigned long long *)(objp + cachep->size -
+ sizeof(unsigned long long) -
+ REDZONE_ALIGN);
+ return (unsigned long long *) (objp + cachep->size -
+ sizeof(unsigned long long));
}
static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_STORE_USER));
- return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
+ return (void **)(objp + cachep->size - BYTES_PER_WORD);
}
#else
#define obj_offset(x) 0
-#define obj_size(cachep) (cachep->buffer_size)
-#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
-#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
+#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
+#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
#endif
-/*
- * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
- * order.
- */
-#if defined(CONFIG_LARGE_ALLOCS)
-#define MAX_OBJ_ORDER 13 /* up to 32Mb */
-#define MAX_GFP_ORDER 13 /* up to 32Mb */
-#elif defined(CONFIG_MMU)
-#define MAX_OBJ_ORDER 5 /* 32 pages */
-#define MAX_GFP_ORDER 5 /* 32 pages */
-#else
-#define MAX_OBJ_ORDER 8 /* up to 1Mb */
-#define MAX_GFP_ORDER 8 /* up to 1Mb */
-#endif
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
-/*
- * Do not go above this order unless 0 objects fit into the slab.
- */
-#define BREAK_GFP_ORDER_HI 1
-#define BREAK_GFP_ORDER_LO 0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#ifdef CONFIG_DEBUG_SLAB_LEAK
-/*
- * Functions for storing/retrieving the cachep and or slab from the page
- * allocator. These are used to find the slab an obj belongs to. With kfree(),
- * these are used to find the cache which an obj belongs to.
- */
-static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
+static void set_obj_status(struct page *page, int idx, int val)
{
- page->lru.next = (struct list_head *)cache;
-}
+ int freelist_size;
+ char *status;
+ struct kmem_cache *cachep = page->slab_cache;
-static inline struct kmem_cache *page_get_cache(struct page *page)
-{
- if (unlikely(PageCompound(page)))
- page = (struct page *)page_private(page);
- BUG_ON(!PageSlab(page));
- return (struct kmem_cache *)page->lru.next;
+ freelist_size = cachep->num * sizeof(freelist_idx_t);
+ status = (char *)page->freelist + freelist_size;
+ status[idx] = val;
}
-static inline void page_set_slab(struct page *page, struct slab *slab)
+static inline unsigned int get_obj_status(struct page *page, int idx)
{
- page->lru.prev = (struct list_head *)slab;
-}
+ int freelist_size;
+ char *status;
+ struct kmem_cache *cachep = page->slab_cache;
-static inline struct slab *page_get_slab(struct page *page)
-{
- if (unlikely(PageCompound(page)))
- page = (struct page *)page_private(page);
- BUG_ON(!PageSlab(page));
- return (struct slab *)page->lru.prev;
-}
+ freelist_size = cachep->num * sizeof(freelist_idx_t);
+ status = (char *)page->freelist + freelist_size;
-static inline struct kmem_cache *virt_to_cache(const void *obj)
-{
- struct page *page = virt_to_page(obj);
- return page_get_cache(page);
+ return status[idx];
}
-static inline struct slab *virt_to_slab(const void *obj)
-{
- struct page *page = virt_to_page(obj);
- return page_get_slab(page);
-}
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
-static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
- unsigned int idx)
+#endif
+
+/*
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
+ */
+#define SLAB_MAX_ORDER_HI 1
+#define SLAB_MAX_ORDER_LO 0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
+
+static inline struct kmem_cache *virt_to_cache(const void *obj)
{
- return slab->s_mem + cache->buffer_size * idx;
+ struct page *page = virt_to_head_page(obj);
+ return page->slab_cache;
}
-static inline unsigned int obj_to_index(struct kmem_cache *cache,
- struct slab *slab, void *obj)
+static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
+ unsigned int idx)
{
- return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+ return page->s_mem + cache->size * idx;
}
/*
- * These are the default caches for kmalloc. Custom caches can have other sizes.
+ * We want to avoid an expensive divide : (offset / cache->size)
+ * Using the fact that size is a constant for a particular cache,
+ * we can replace (offset / cache->size) by
+ * reciprocal_divide(offset, cache->reciprocal_buffer_size)
*/
-struct cache_sizes malloc_sizes[] = {
-#define CACHE(x) { .cs_size = (x) },
-#include <linux/kmalloc_sizes.h>
- CACHE(ULONG_MAX)
-#undef CACHE
-};
-EXPORT_SYMBOL(malloc_sizes);
-
-/* Must match cache_sizes above. Out of line to keep cache footprint low. */
-struct cache_names {
- char *name;
- char *name_dma;
-};
-
-static struct cache_names __initdata cache_names[] = {
-#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
-#include <linux/kmalloc_sizes.h>
- {NULL,}
-#undef CACHE
-};
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+ const struct page *page, void *obj)
+{
+ u32 offset = (obj - page->s_mem);
+ return reciprocal_divide(offset, cache->reciprocal_buffer_size);
+}
-static struct arraycache_init initarray_cache __initdata =
- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
-static struct kmem_cache cache_cache = {
+static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
- .buffer_size = sizeof(struct kmem_cache),
+ .size = sizeof(struct kmem_cache),
.name = "kmem_cache",
-#if DEBUG
- .obj_size = sizeof(struct kmem_cache),
-#endif
};
#define BAD_ALIEN_MAGIC 0x01020304ul
@@ -692,108 +483,176 @@ static struct kmem_cache cache_cache = {
static struct lock_class_key on_slab_l3_key;
static struct lock_class_key on_slab_alc_key;
-static inline void init_lock_keys(void)
+static struct lock_class_key debugobj_l3_key;
+static struct lock_class_key debugobj_alc_key;
+static void slab_set_lock_classes(struct kmem_cache *cachep,
+ struct lock_class_key *l3_key, struct lock_class_key *alc_key,
+ int q)
{
- int q;
- struct cache_sizes *s = malloc_sizes;
+ struct array_cache **alc;
+ struct kmem_cache_node *n;
+ int r;
- while (s->cs_size != ULONG_MAX) {
- for_each_node(q) {
- struct array_cache **alc;
- int r;
- struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
- if (!l3 || OFF_SLAB(s->cs_cachep))
- continue;
- lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
- alc = l3->alien;
- /*
- * FIXME: This check for BAD_ALIEN_MAGIC
- * should go away when common slab code is taught to
- * work even without alien caches.
- * Currently, non NUMA code returns BAD_ALIEN_MAGIC
- * for alloc_alien_cache,
- */
- if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
- continue;
- for_each_node(r) {
- if (alc[r])
- lockdep_set_class(&alc[r]->lock,
- &on_slab_alc_key);
- }
- }
- s++;
+ n = cachep->node[q];
+ if (!n)
+ return;
+
+ lockdep_set_class(&n->list_lock, l3_key);
+ alc = n->alien;
+ /*
+ * FIXME: This check for BAD_ALIEN_MAGIC
+ * should go away when common slab code is taught to
+ * work even without alien caches.
+ * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+ * for alloc_alien_cache,
+ */
+ if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+ return;
+ for_each_node(r) {
+ if (alc[r])
+ lockdep_set_class(&alc[r]->lock, alc_key);
+ }
+}
+
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+ slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
+}
+
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{
+ int node;
+
+ for_each_online_node(node)
+ slab_set_debugobj_lock_classes_node(cachep, node);
+}
+
+static void init_node_lock_keys(int q)
+{
+ int i;
+
+ if (slab_state < UP)
+ return;
+
+ for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
+ struct kmem_cache_node *n;
+ struct kmem_cache *cache = kmalloc_caches[i];
+
+ if (!cache)
+ continue;
+
+ n = cache->node[q];
+ if (!n || OFF_SLAB(cache))
+ continue;
+
+ slab_set_lock_classes(cache, &on_slab_l3_key,
+ &on_slab_alc_key, q);
}
}
+
+static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
+{
+ if (!cachep->node[q])
+ return;
+
+ slab_set_lock_classes(cachep, &on_slab_l3_key,
+ &on_slab_alc_key, q);
+}
+
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+ int node;
+
+ VM_BUG_ON(OFF_SLAB(cachep));
+ for_each_node(node)
+ on_slab_lock_classes_node(cachep, node);
+}
+
+static inline void init_lock_keys(void)
+{
+ int node;
+
+ for_each_node(node)
+ init_node_lock_keys(node);
+}
#else
+static void init_node_lock_keys(int q)
+{
+}
+
static inline void init_lock_keys(void)
{
}
-#endif
-/* Guard access to the cache-chain. */
-static DEFINE_MUTEX(cache_chain_mutex);
-static struct list_head cache_chain;
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+}
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
- NONE,
- PARTIAL_AC,
- PARTIAL_L3,
- FULL
-} g_cpucache_up;
+static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+}
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
- return g_cpucache_up == FULL;
}
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{
+}
+#endif
+
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
return cachep->array[smp_processor_id()];
}
-static inline struct kmem_cache *__find_general_cachep(size_t size,
- gfp_t gfpflags)
+static size_t calculate_freelist_size(int nr_objs, size_t align)
{
- struct cache_sizes *csizep = malloc_sizes;
+ size_t freelist_size;
-#if DEBUG
- /* This happens if someone tries to call
- * kmem_cache_create(), or __kmalloc(), before
- * the generic caches are initialized.
- */
- BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
-#endif
- while (size > csizep->cs_size)
- csizep++;
+ freelist_size = nr_objs * sizeof(freelist_idx_t);
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ freelist_size += nr_objs * sizeof(char);
- /*
- * Really subtle: The last entry with cs->cs_size==ULONG_MAX
- * has cs_{dma,}cachep==NULL. Thus no special case
- * for large kmalloc calls required.
- */
- if (unlikely(gfpflags & GFP_DMA))
- return csizep->cs_dmacachep;
- return csizep->cs_cachep;
-}
+ if (align)
+ freelist_size = ALIGN(freelist_size, align);
-static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
-{
- return __find_general_cachep(size, gfpflags);
+ return freelist_size;
}
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
+ size_t idx_size, size_t align)
{
- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
+ int nr_objs;
+ size_t remained_size;
+ size_t freelist_size;
+ int extra_space = 0;
+
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ extra_space = sizeof(char);
+ /*
+ * Ignore padding for the initial guess. The padding
+ * is at most @align-1 bytes, and @buffer_size is at
+ * least @align. In the worst case, this result will
+ * be one greater than the number of objects that fit
+ * into the memory allocation when taking the padding
+ * into account.
+ */
+ nr_objs = slab_size / (buffer_size + idx_size + extra_space);
+
+ /*
+ * This calculated number will be either the right
+ * amount, or one greater than what we want.
+ */
+ remained_size = slab_size - nr_objs * buffer_size;
+ freelist_size = calculate_freelist_size(nr_objs, align);
+ if (remained_size < freelist_size)
+ nr_objs--;
+
+ return nr_objs;
}
/*
@@ -812,8 +671,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - The struct slab
- * - One kmem_bufctl_t for each object
+ * - One unsigned int for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
@@ -826,38 +684,17 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
mgmt_size = 0;
nr_objs = slab_size / buffer_size;
- if (nr_objs > SLAB_LIMIT)
- nr_objs = SLAB_LIMIT;
} else {
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = (slab_size - sizeof(struct slab)) /
- (buffer_size + sizeof(kmem_bufctl_t));
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
- > slab_size)
- nr_objs--;
-
- if (nr_objs > SLAB_LIMIT)
- nr_objs = SLAB_LIMIT;
-
- mgmt_size = slab_mgmt_size(nr_objs, align);
+ nr_objs = calculate_nr_objs(slab_size, buffer_size,
+ sizeof(freelist_idx_t), align);
+ mgmt_size = calculate_freelist_size(nr_objs, align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
-#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
+#if DEBUG
+#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
@@ -865,7 +702,36 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+#endif
+
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+ use_alien_caches = 0;
+ return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
+static int __init slab_max_order_setup(char *str)
+{
+ get_option(&str, &slab_max_order);
+ slab_max_order = slab_max_order < 0 ? 0 :
+ min(slab_max_order, MAX_ORDER - 1);
+ slab_max_order_set = true;
+
+ return 1;
}
+__setup("slab_max_order=", slab_max_order_setup);
#ifdef CONFIG_NUMA
/*
@@ -874,33 +740,27 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
* objects freed on different nodes from which they were allocated) and the
* flushing of remote pcps by calling drain_node_pages.
*/
-static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
static void init_reap_node(int cpu)
{
int node;
- node = next_node(cpu_to_node(cpu), node_online_map);
+ node = next_node(cpu_to_mem(cpu), node_online_map);
if (node == MAX_NUMNODES)
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ per_cpu(slab_reap_node, cpu) = node;
}
static void next_reap_node(void)
{
- int node = __get_cpu_var(reap_node);
-
- /*
- * Also drain per cpu pages on remote zones
- */
- if (node != numa_node_id())
- drain_node_pages(node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
@@ -915,29 +775,38 @@ static void next_reap_node(void)
* the CPUs getting into lockstep and contending for the global cache chain
* lock.
*/
-static void __devinit start_cpu_timer(int cpu)
+static void start_cpu_timer(int cpu)
{
- struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+ struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
/*
* When this gets called from do_initcalls via cpucache_init(),
* init_workqueues() has already run, so keventd will be setup
* at that time.
*/
- if (keventd_up() && reap_work->func == NULL) {
+ if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_WORK(reap_work, cache_reap, NULL);
- schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+ INIT_DEFERRABLE_WORK(reap_work, cache_reap);
+ schedule_delayed_work_on(cpu, reap_work,
+ __round_jiffies_relative(HZ, cpu));
}
}
static struct array_cache *alloc_arraycache(int node, int entries,
- int batchcount)
+ int batchcount, gfp_t gfp)
{
int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
struct array_cache *nc = NULL;
- nc = kmalloc_node(memsize, GFP_KERNEL, node);
+ nc = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free object.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(nc);
if (nc) {
nc->avail = 0;
nc->limit = entries;
@@ -948,6 +817,122 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
+static inline bool is_slab_pfmemalloc(struct page *page)
+{
+ return PageSlabPfmemalloc(page);
+}
+
+/* Clears pfmemalloc_active if no slabs have pfmalloc set */
+static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
+ struct array_cache *ac)
+{
+ struct kmem_cache_node *n = cachep->node[numa_mem_id()];
+ struct page *page;
+ unsigned long flags;
+
+ if (!pfmemalloc_active)
+ return;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->slabs_full, lru)
+ if (is_slab_pfmemalloc(page))
+ goto out;
+
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ if (is_slab_pfmemalloc(page))
+ goto out;
+
+ list_for_each_entry(page, &n->slabs_free, lru)
+ if (is_slab_pfmemalloc(page))
+ goto out;
+
+ pfmemalloc_active = false;
+out:
+ spin_unlock_irqrestore(&n->list_lock, flags);
+}
+
+static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ gfp_t flags, bool force_refill)
+{
+ int i;
+ void *objp = ac->entry[--ac->avail];
+
+ /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
+ if (unlikely(is_obj_pfmemalloc(objp))) {
+ struct kmem_cache_node *n;
+
+ if (gfp_pfmemalloc_allowed(flags)) {
+ clear_obj_pfmemalloc(&objp);
+ return objp;
+ }
+
+ /* The caller cannot use PFMEMALLOC objects, find another one */
+ for (i = 0; i < ac->avail; i++) {
+ /* If a !PFMEMALLOC object is found, swap them */
+ if (!is_obj_pfmemalloc(ac->entry[i])) {
+ objp = ac->entry[i];
+ ac->entry[i] = ac->entry[ac->avail];
+ ac->entry[ac->avail] = objp;
+ return objp;
+ }
+ }
+
+ /*
+ * If there are empty slabs on the slabs_free list and we are
+ * being forced to refill the cache, mark this one !pfmemalloc.
+ */
+ n = cachep->node[numa_mem_id()];
+ if (!list_empty(&n->slabs_free) && force_refill) {
+ struct page *page = virt_to_head_page(objp);
+ ClearPageSlabPfmemalloc(page);
+ clear_obj_pfmemalloc(&objp);
+ recheck_pfmemalloc_active(cachep, ac);
+ return objp;
+ }
+
+ /* No !PFMEMALLOC objects available */
+ ac->avail++;
+ objp = NULL;
+ }
+
+ return objp;
+}
+
+static inline void *ac_get_obj(struct kmem_cache *cachep,
+ struct array_cache *ac, gfp_t flags, bool force_refill)
+{
+ void *objp;
+
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_get_obj(cachep, ac, flags, force_refill);
+ else
+ objp = ac->entry[--ac->avail];
+
+ return objp;
+}
+
+static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ if (unlikely(pfmemalloc_active)) {
+ /* Some pfmemalloc slabs exist, check if this is one */
+ struct page *page = virt_to_head_page(objp);
+ if (PageSlabPfmemalloc(page))
+ set_obj_pfmemalloc(&objp);
+ }
+
+ return objp;
+}
+
+static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_put_obj(cachep, ac, objp);
+
+ ac->entry[ac->avail++] = objp;
+}
+
/*
* Transfer objects in one arraycache to another.
* Locking must be handled by the caller.
@@ -958,7 +943,7 @@ static int transfer_objects(struct array_cache *to,
struct array_cache *from, unsigned int max)
{
/* Figure out how many entries to transfer */
- int nr = min(min(from->avail, max), to->limit - to->avail);
+ int nr = min3(from->avail, max, to->limit - to->avail);
if (!nr)
return 0;
@@ -968,16 +953,15 @@ static int transfer_objects(struct array_cache *to,
from->avail -= nr;
to->avail += nr;
- to->touched = 1;
return nr;
}
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
+#define reap_alien(cachep, n) do { } while (0)
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
return (struct array_cache **)BAD_ALIEN_MAGIC;
}
@@ -997,7 +981,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
return NULL;
}
-static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
gfp_t flags, int nodeid)
{
return NULL;
@@ -1005,27 +989,25 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
#else /* CONFIG_NUMA */
-static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct array_cache **ac_ptr;
- int memsize = sizeof(void *) * MAX_NUMNODES;
+ int memsize = sizeof(void *) * nr_node_ids;
int i;
if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ ac_ptr = kzalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
- if (i == node || !node_online(i)) {
- ac_ptr[i] = NULL;
+ if (i == node || !node_online(i))
continue;
- }
- ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+ ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
- for (i--; i <= 0; i--)
+ for (i--; i >= 0; i--)
kfree(ac_ptr[i]);
kfree(ac_ptr);
return NULL;
@@ -1049,33 +1031,33 @@ static void free_alien_cache(struct array_cache **ac_ptr)
static void __drain_alien_cache(struct kmem_cache *cachep,
struct array_cache *ac, int node)
{
- struct kmem_list3 *rl3 = cachep->nodelists[node];
+ struct kmem_cache_node *n = cachep->node[node];
if (ac->avail) {
- spin_lock(&rl3->list_lock);
+ spin_lock(&n->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
* into the free lists and getting them back later.
*/
- if (rl3->shared)
- transfer_objects(rl3->shared, ac, ac->limit);
+ if (n->shared)
+ transfer_objects(n->shared, ac, ac->limit);
free_block(cachep, ac->entry, ac->avail, node);
ac->avail = 0;
- spin_unlock(&rl3->list_lock);
+ spin_unlock(&n->list_lock);
}
}
/*
* Called from cache_reap() to regularly drain alien caches round robin.
*/
-static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
{
- int node = __get_cpu_var(reap_node);
+ int node = __this_cpu_read(slab_reap_node);
- if (l3->alien) {
- struct array_cache *ac = l3->alien[node];
+ if (n->alien) {
+ struct array_cache *ac = n->alien[node];
if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
__drain_alien_cache(cachep, ac, node);
@@ -1103,243 +1085,408 @@ static void drain_alien_cache(struct kmem_cache *cachep,
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- struct slab *slabp = virt_to_slab(objp);
- int nodeid = slabp->nodeid;
- struct kmem_list3 *l3;
+ int nodeid = page_to_nid(virt_to_page(objp));
+ struct kmem_cache_node *n;
struct array_cache *alien = NULL;
+ int node;
+
+ node = numa_mem_id();
/*
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(slabp->nodeid == numa_node_id()))
+ if (likely(nodeid == node))
return 0;
- l3 = cachep->nodelists[numa_node_id()];
+ n = cachep->node[node];
STATS_INC_NODEFREES(cachep);
- if (l3->alien && l3->alien[nodeid]) {
- alien = l3->alien[nodeid];
+ if (n->alien && n->alien[nodeid]) {
+ alien = n->alien[nodeid];
spin_lock(&alien->lock);
if (unlikely(alien->avail == alien->limit)) {
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, alien, nodeid);
}
- alien->entry[alien->avail++] = objp;
+ ac_put_obj(cachep, alien, objp);
spin_unlock(&alien->lock);
} else {
- spin_lock(&(cachep->nodelists[nodeid])->list_lock);
+ spin_lock(&(cachep->node[nodeid])->list_lock);
free_block(cachep, &objp, 1, nodeid);
- spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
+ spin_unlock(&(cachep->node[nodeid])->list_lock);
}
return 1;
}
#endif
-static int __cpuinit cpuup_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+/*
+ * Allocates and initializes node for a node on each slab cache, used for
+ * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing node are not replaced if
+ * already in use.
+ *
+ * Must hold slab_mutex.
+ */
+static int init_cache_node_node(int node)
{
- long cpu = (long)hcpu;
struct kmem_cache *cachep;
- struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- int memsize = sizeof(struct kmem_list3);
+ struct kmem_cache_node *n;
+ const int memsize = sizeof(struct kmem_cache_node);
- switch (action) {
- case CPU_UP_PREPARE:
- mutex_lock(&cache_chain_mutex);
+ list_for_each_entry(cachep, &slab_caches, list) {
/*
- * We need to do this right in the beginning since
- * alloc_arraycache's are going to use this list.
- * kmalloc_node allows us to add the slab to the right
- * kmem_list3 and not this cpu's kmem_list3
+ * Set up the kmem_cache_node for cpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
*/
+ if (!cachep->node[node]) {
+ n = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (!n)
+ return -ENOMEM;
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
- list_for_each_entry(cachep, &cache_chain, next) {
/*
- * Set up the size64 kmemlist for cpu before we can
- * begin anything. Make sure some other cpu on this
- * node has not already allocated this
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
+ * protection here.
*/
- if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
- if (!l3)
- goto bad;
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- /*
- * The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
- * protection here.
- */
- cachep->nodelists[node] = l3;
- }
+ cachep->node[node] = n;
+ }
+
+ spin_lock_irq(&cachep->node[node]->list_lock);
+ cachep->node[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->node[node]->list_lock);
+ }
+ return 0;
+}
- spin_lock_irq(&cachep->nodelists[node]->list_lock);
- cachep->nodelists[node]->free_limit =
- (1 + nr_cpus_node(node)) *
- cachep->batchcount + cachep->num;
- spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+static inline int slabs_tofree(struct kmem_cache *cachep,
+ struct kmem_cache_node *n)
+{
+ return (n->free_objects + cachep->num - 1) / cachep->num;
+}
+
+static void cpuup_canceled(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_cache_node *n = NULL;
+ int node = cpu_to_mem(cpu);
+ const struct cpumask *mask = cpumask_of_node(node);
+
+ list_for_each_entry(cachep, &slab_caches, list) {
+ struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
+
+ /* cpu is dead; no one can alloc from it. */
+ nc = cachep->array[cpu];
+ cachep->array[cpu] = NULL;
+ n = cachep->node[node];
+
+ if (!n)
+ goto free_array_cache;
+
+ spin_lock_irq(&n->list_lock);
+
+ /* Free limit for this kmem_cache_node */
+ n->free_limit -= cachep->batchcount;
+ if (nc)
+ free_block(cachep, nc->entry, nc->avail, node);
+
+ if (!cpumask_empty(mask)) {
+ spin_unlock_irq(&n->list_lock);
+ goto free_array_cache;
}
- /*
- * Now we can go ahead with allocating the shared arrays and
- * array caches
- */
- list_for_each_entry(cachep, &cache_chain, next) {
- struct array_cache *nc;
- struct array_cache *shared;
- struct array_cache **alien;
-
- nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
- if (!nc)
- goto bad;
+ shared = n->shared;
+ if (shared) {
+ free_block(cachep, shared->entry,
+ shared->avail, node);
+ n->shared = NULL;
+ }
+
+ alien = n->alien;
+ n->alien = NULL;
+
+ spin_unlock_irq(&n->list_lock);
+
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
+ }
+free_array_cache:
+ kfree(nc);
+ }
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &slab_caches, list) {
+ n = cachep->node[node];
+ if (!n)
+ continue;
+ drain_freelist(cachep, n, slabs_tofree(cachep, n));
+ }
+}
+
+static int cpuup_prepare(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_cache_node *n = NULL;
+ int node = cpu_to_mem(cpu);
+ int err;
+
+ /*
+ * We need to do this right in the beginning since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_cache_node and not this cpu's kmem_cache_node
+ */
+ err = init_cache_node_node(node);
+ if (err < 0)
+ goto bad;
+
+ /*
+ * Now we can go ahead with allocating the shared arrays and
+ * array caches
+ */
+ list_for_each_entry(cachep, &slab_caches, list) {
+ struct array_cache *nc;
+ struct array_cache *shared = NULL;
+ struct array_cache **alien = NULL;
+
+ nc = alloc_arraycache(node, cachep->limit,
+ cachep->batchcount, GFP_KERNEL);
+ if (!nc)
+ goto bad;
+ if (cachep->shared) {
shared = alloc_arraycache(node,
- cachep->shared * cachep->batchcount,
- 0xbaadf00d);
- if (!shared)
+ cachep->shared * cachep->batchcount,
+ 0xbaadf00d, GFP_KERNEL);
+ if (!shared) {
+ kfree(nc);
goto bad;
-
- alien = alloc_alien_cache(node, cachep->limit);
- if (!alien)
+ }
+ }
+ if (use_alien_caches) {
+ alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
+ if (!alien) {
+ kfree(shared);
+ kfree(nc);
goto bad;
- cachep->array[cpu] = nc;
- l3 = cachep->nodelists[node];
- BUG_ON(!l3);
-
- spin_lock_irq(&l3->list_lock);
- if (!l3->shared) {
- /*
- * We are serialised from CPU_DEAD or
- * CPU_UP_CANCELLED by the cpucontrol lock
- */
- l3->shared = shared;
- shared = NULL;
}
+ }
+ cachep->array[cpu] = nc;
+ n = cachep->node[node];
+ BUG_ON(!n);
+
+ spin_lock_irq(&n->list_lock);
+ if (!n->shared) {
+ /*
+ * We are serialised from CPU_DEAD or
+ * CPU_UP_CANCELLED by the cpucontrol lock
+ */
+ n->shared = shared;
+ shared = NULL;
+ }
#ifdef CONFIG_NUMA
- if (!l3->alien) {
- l3->alien = alien;
- alien = NULL;
- }
-#endif
- spin_unlock_irq(&l3->list_lock);
- kfree(shared);
- free_alien_cache(alien);
+ if (!n->alien) {
+ n->alien = alien;
+ alien = NULL;
}
- mutex_unlock(&cache_chain_mutex);
+#endif
+ spin_unlock_irq(&n->list_lock);
+ kfree(shared);
+ free_alien_cache(alien);
+ if (cachep->flags & SLAB_DEBUG_OBJECTS)
+ slab_set_debugobj_lock_classes_node(cachep, node);
+ else if (!OFF_SLAB(cachep) &&
+ !(cachep->flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes_node(cachep, node);
+ }
+ init_node_lock_keys(node);
+
+ return 0;
+bad:
+ cpuup_canceled(cpu);
+ return -ENOMEM;
+}
+
+static int cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ int err = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ mutex_lock(&slab_mutex);
+ err = cpuup_prepare(cpu);
+ mutex_unlock(&slab_mutex);
break;
case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ /*
+ * Shutdown cache reaper. Note that the slab_mutex is
+ * held so that if cache_reap() is invoked it cannot do
+ * anything expensive but will only modify reap_work
+ * and reschedule the timer.
+ */
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
+ /* Now the cache_reaper is guaranteed to be not running. */
+ per_cpu(slab_reap_work, cpu).work.func = NULL;
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
/*
* Even if all the cpus of a node are down, we don't free the
- * kmem_list3 of any cache. This to avoid a race between
+ * kmem_cache_node of any cache. This to avoid a race between
* cpu_down, and a kmalloc allocation from another cpu for
- * memory from the node of the cpu going down. The list3
+ * memory from the node of the cpu going down. The node
* structure is usually allocated from kmem_cache_create() and
* gets destroyed at kmem_cache_destroy().
*/
- /* fall thru */
+ /* fall through */
+#endif
case CPU_UP_CANCELED:
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next) {
- struct array_cache *nc;
- struct array_cache *shared;
- struct array_cache **alien;
- cpumask_t mask;
-
- mask = node_to_cpumask(node);
- /* cpu is dead; no one can alloc from it. */
- nc = cachep->array[cpu];
- cachep->array[cpu] = NULL;
- l3 = cachep->nodelists[node];
-
- if (!l3)
- goto free_array_cache;
-
- spin_lock_irq(&l3->list_lock);
-
- /* Free limit for this kmem_list3 */
- l3->free_limit -= cachep->batchcount;
- if (nc)
- free_block(cachep, nc->entry, nc->avail, node);
-
- if (!cpus_empty(mask)) {
- spin_unlock_irq(&l3->list_lock);
- goto free_array_cache;
- }
+ case CPU_UP_CANCELED_FROZEN:
+ mutex_lock(&slab_mutex);
+ cpuup_canceled(cpu);
+ mutex_unlock(&slab_mutex);
+ break;
+ }
+ return notifier_from_errno(err);
+}
- shared = l3->shared;
- if (shared) {
- free_block(cachep, l3->shared->entry,
- l3->shared->avail, node);
- l3->shared = NULL;
- }
+static struct notifier_block cpucache_notifier = {
+ &cpuup_callback, NULL, 0
+};
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold slab_mutex.
+ */
+static int __meminit drain_cache_node_node(int node)
+{
+ struct kmem_cache *cachep;
+ int ret = 0;
- alien = l3->alien;
- l3->alien = NULL;
+ list_for_each_entry(cachep, &slab_caches, list) {
+ struct kmem_cache_node *n;
- spin_unlock_irq(&l3->list_lock);
+ n = cachep->node[node];
+ if (!n)
+ continue;
- kfree(shared);
- if (alien) {
- drain_alien_cache(cachep, alien);
- free_alien_cache(alien);
- }
-free_array_cache:
- kfree(nc);
- }
- /*
- * In the previous loop, all the objects were freed to
- * the respective cache's slabs, now we can go ahead and
- * shrink each nodelist to its limit.
- */
- list_for_each_entry(cachep, &cache_chain, next) {
- l3 = cachep->nodelists[node];
- if (!l3)
- continue;
- drain_freelist(cachep, l3, l3->free_objects);
+ drain_freelist(cachep, n, slabs_tofree(cachep, n));
+
+ if (!list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial)) {
+ ret = -EBUSY;
+ break;
}
- mutex_unlock(&cache_chain_mutex);
- break;
-#endif
}
- return NOTIFY_OK;
-bad:
- mutex_unlock(&cache_chain_mutex);
- return NOTIFY_BAD;
+ return ret;
}
-static struct notifier_block __cpuinitdata cpucache_notifier = {
- &cpuup_callback, NULL, 0
-};
+static int __meminit slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mnb = arg;
+ int ret = 0;
+ int nid;
+
+ nid = mnb->status_change_nid;
+ if (nid < 0)
+ goto out;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ mutex_lock(&slab_mutex);
+ ret = init_cache_node_node(nid);
+ mutex_unlock(&slab_mutex);
+ break;
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&slab_mutex);
+ ret = drain_cache_node_node(nid);
+ mutex_unlock(&slab_mutex);
+ break;
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_CANCEL_OFFLINE:
+ break;
+ }
+out:
+ return notifier_from_errno(ret);
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
/*
- * swap the static kmem_list3 with kmalloced memory
+ * swap the static kmem_cache_node with kmalloced memory
*/
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
- int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list,
+ int nodeid)
{
- struct kmem_list3 *ptr;
+ struct kmem_cache_node *ptr;
- BUG_ON(cachep->nodelists[nodeid] != list);
- ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
BUG_ON(!ptr);
- local_irq_disable();
- memcpy(ptr, list, sizeof(struct kmem_list3));
+ memcpy(ptr, list, sizeof(struct kmem_cache_node));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
- cachep->nodelists[nodeid] = ptr;
- local_irq_enable();
+ cachep->node[nodeid] = ptr;
+}
+
+/*
+ * For setting up all the kmem_cache_node for cache whose buffer_size is same as
+ * size of kmem_cache_node.
+ */
+static void __init set_up_node(struct kmem_cache *cachep, int index)
+{
+ int node;
+
+ for_each_online_node(node) {
+ cachep->node[node] = &init_kmem_cache_node[index + node];
+ cachep->node[node]->next_reap = jiffies +
+ REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ }
+}
+
+/*
+ * The memory after the last cpu cache pointer is used for the
+ * node pointer.
+ */
+static void setup_node_pointer(struct kmem_cache *cachep)
+{
+ cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
}
/*
@@ -1348,188 +1495,144 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
*/
void __init kmem_cache_init(void)
{
- size_t left_over;
- struct cache_sizes *sizes;
- struct cache_names *names;
int i;
- int order;
- for (i = 0; i < NUM_INIT_LISTS; i++) {
- kmem_list3_init(&initkmem_list3[i]);
- if (i < MAX_NUMNODES)
- cache_cache.nodelists[i] = NULL;
- }
+ BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
+ sizeof(struct rcu_head));
+ kmem_cache = &kmem_cache_boot;
+ setup_node_pointer(kmem_cache);
+
+ if (num_possible_nodes() == 1)
+ use_alien_caches = 0;
+
+ for (i = 0; i < NUM_INIT_LISTS; i++)
+ kmem_cache_node_init(&init_kmem_cache_node[i]);
+
+ set_up_node(kmem_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
- * page orders on machines with more than 32MB of memory.
+ * page orders on machines with more than 32MB of memory if
+ * not overridden on the command line.
*/
- if (num_physpages > (32 << 20) >> PAGE_SHIFT)
- slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+ if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+ slab_max_order = SLAB_MAX_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
- * 1) initialize the cache_cache cache: it contains the struct
- * kmem_cache structures of all caches, except cache_cache itself:
- * cache_cache is statically allocated.
+ * 1) initialize the kmem_cache cache: it contains the struct
+ * kmem_cache structures of all caches, except kmem_cache itself:
+ * kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
- * kmem_list3 structures, it's replaced with a kmalloc allocated
+ * kmem_cache_node structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The struct kmem_cache for the new cache is allocated normally.
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
- * 4) Replace the __init data head arrays for cache_cache and the first
+ * 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * 5) Replace the __init data for kmem_cache_node for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- /* 1) create the cache_cache */
- INIT_LIST_HEAD(&cache_chain);
- list_add(&cache_cache.next, &cache_chain);
- cache_cache.colour_off = cache_line_size();
- cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
+ /* 1) create the kmem_cache */
- cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
- cache_line_size());
-
- for (order = 0; order < MAX_ORDER; order++) {
- cache_estimate(order, cache_cache.buffer_size,
- cache_line_size(), 0, &left_over, &cache_cache.num);
- if (cache_cache.num)
- break;
- }
- BUG_ON(!cache_cache.num);
- cache_cache.gfporder = order;
- cache_cache.colour = left_over / cache_cache.colour_off;
- cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
- sizeof(struct slab), cache_line_size());
+ /*
+ * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
+ */
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN);
+ list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
- sizes = malloc_sizes;
- names = cache_names;
/*
* Initialize the caches that provide memory for the array cache and the
- * kmem_list3 structures first. Without this, further allocations will
+ * kmem_cache_node structures first. Without this, further allocations will
* bug.
*/
- sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL, NULL);
-
- if (INDEX_AC != INDEX_L3) {
- sizes[INDEX_L3].cs_cachep =
- kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL, NULL);
- }
+ kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
+ kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
- slab_early_init = 0;
+ if (INDEX_AC != INDEX_NODE)
+ kmalloc_caches[INDEX_NODE] =
+ create_kmalloc_cache("kmalloc-node",
+ kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
- while (sizes->cs_size != ULONG_MAX) {
- /*
- * For performance, all the general caches are L1 aligned.
- * This should be particularly beneficial on SMP boxes, as it
- * eliminates "false sharing".
- * Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches.
- */
- if (!sizes->cs_cachep) {
- sizes->cs_cachep = kmem_cache_create(names->name,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL, NULL);
- }
+ slab_early_init = 0;
- sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
- SLAB_PANIC,
- NULL, NULL);
- sizes++;
- names++;
- }
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- local_irq_disable();
- BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, cpu_cache_get(&cache_cache),
+ memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
- cache_cache.array[smp_processor_id()] = ptr;
- local_irq_enable();
+ kmem_cache->array[smp_processor_id()] = ptr;
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- local_irq_disable();
- BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
+ BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
!= &initarray_generic.cache);
- memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
+ memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
- malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
- ptr;
- local_irq_enable();
+ kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
}
- /* 5) Replace the bootstrap kmem_list3's */
+ /* 5) Replace the bootstrap kmem_cache_node */
{
- int node;
- /* Replace the static kmem_list3 structures for the boot cpu */
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
- numa_node_id());
-
- for_each_online_node(node) {
- init_list(malloc_sizes[INDEX_AC].cs_cachep,
- &initkmem_list3[SIZE_AC + node], node);
-
- if (INDEX_AC != INDEX_L3) {
- init_list(malloc_sizes[INDEX_L3].cs_cachep,
- &initkmem_list3[SIZE_L3 + node],
- node);
+ int nid;
+
+ for_each_online_node(nid) {
+ init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
+
+ init_list(kmalloc_caches[INDEX_AC],
+ &init_kmem_cache_node[SIZE_AC + nid], nid);
+
+ if (INDEX_AC != INDEX_NODE) {
+ init_list(kmalloc_caches[INDEX_NODE],
+ &init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
}
+ create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
+}
+
+void __init kmem_cache_init_late(void)
+{
+ struct kmem_cache *cachep;
+
+ slab_state = UP;
+
/* 6) resize the head arrays to their final sizes */
- {
- struct kmem_cache *cachep;
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep))
- BUG();
- mutex_unlock(&cache_chain_mutex);
- }
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(cachep, &slab_caches, list)
+ if (enable_cpucache(cachep, GFP_NOWAIT))
+ BUG();
+ mutex_unlock(&slab_mutex);
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
-
/* Done! */
- g_cpucache_up = FULL;
+ slab_state = FULL;
/*
* Register a cpu startup notifier callback that initializes
@@ -1537,6 +1640,14 @@ void __init kmem_cache_init(void)
*/
register_cpu_notifier(&cpucache_notifier);
+#ifdef CONFIG_NUMA
+ /*
+ * Register a memory hotplug callback that initializes and frees
+ * node.
+ */
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
@@ -1552,10 +1663,66 @@ static int __init cpucache_init(void)
*/
for_each_online_cpu(cpu)
start_cpu_timer(cpu);
+
+ /* Done! */
+ slab_state = FULL;
return 0;
}
__initcall(cpucache_init);
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+#if DEBUG
+ struct kmem_cache_node *n;
+ struct page *page;
+ unsigned long flags;
+ int node;
+ static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
+ return;
+
+ printk(KERN_WARNING
+ "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+ nodeid, gfpflags);
+ printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ cachep->name, cachep->size, cachep->gfporder);
+
+ for_each_online_node(node) {
+ unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
+ unsigned long active_slabs = 0, num_slabs = 0;
+
+ n = cachep->node[node];
+ if (!n)
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->slabs_full, lru) {
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ active_objs += page->active;
+ active_slabs++;
+ }
+ list_for_each_entry(page, &n->slabs_free, lru)
+ num_slabs++;
+
+ free_objects += n->free_objects;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ num_slabs += active_slabs;
+ num_objs = num_slabs * cachep->num;
+ printk(KERN_WARNING
+ " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ node, active_slabs, num_slabs, active_objs, num_objs,
+ free_objects);
+ }
+#endif
+}
+
/*
* Interface to system's page allocator. No need to hold the cache-lock.
*
@@ -1563,30 +1730,29 @@ __initcall(cpucache_init);
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid)
{
struct page *page;
int nr_pages;
- int i;
-#ifndef CONFIG_MMU
- /*
- * Nommu uses slab's for process anonymous memory allocations, and thus
- * requires __GFP_COMP to properly refcount higher order allocations
- */
- flags |= __GFP_COMP;
-#endif
+ flags |= cachep->allocflags;
+ if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+ flags |= __GFP_RECLAIMABLE;
- /*
- * Under NUMA we want memory on the indicated node. We will handle
- * the needed fallback ourselves since we want to serve from our
- * per node object lists first for other nodes.
- */
- flags |= cachep->gfpflags | GFP_THISNODE;
+ if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+ return NULL;
- page = alloc_pages_node(nodeid, flags, cachep->gfporder);
- if (!page)
+ page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+ if (!page) {
+ memcg_uncharge_slab(cachep, cachep->gfporder);
+ slab_out_of_memory(cachep, flags, nodeid);
return NULL;
+ }
+
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (unlikely(page->pfmemalloc))
+ pfmemalloc_active = true;
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1595,19 +1761,30 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
- for (i = 0; i < nr_pages; i++)
- __SetPageSlab(page + i);
- return page_address(page);
+ __SetPageSlab(page);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page);
+
+ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
+ kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
+
+ if (cachep->ctor)
+ kmemcheck_mark_uninitialized_pages(page, nr_pages);
+ else
+ kmemcheck_mark_unallocated_pages(page, nr_pages);
+ }
+
+ return page;
}
/*
* Interface to system's page release.
*/
-static void kmem_freepages(struct kmem_cache *cachep, void *addr)
+static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- unsigned long i = (1 << cachep->gfporder);
- struct page *page = virt_to_page(addr);
- const unsigned long nr_freed = i;
+ const unsigned long nr_freed = (1 << cachep->gfporder);
+
+ kmemcheck_free_shadow(page, cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1615,24 +1792,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
else
sub_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_freed);
- while (i--) {
- BUG_ON(!PageSlab(page));
- __ClearPageSlab(page);
- page++;
- }
+
+ BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+ page_mapcount_reset(page);
+ page->mapping = NULL;
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- free_pages((unsigned long)addr, cachep->gfporder);
+ __free_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(cachep, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
{
- struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
- struct kmem_cache *cachep = slab_rcu->cachep;
+ struct kmem_cache *cachep;
+ struct page *page;
- kmem_freepages(cachep, slab_rcu->addr);
- if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->slabp_cache, slab_rcu);
+ page = container_of(head, struct page, rcu_head);
+ cachep = page->slab_cache;
+
+ kmem_freepages(cachep, page);
}
#if DEBUG
@@ -1641,7 +1822,7 @@ static void kmem_rcu_free(struct rcu_head *head)
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
unsigned long caller)
{
- int size = obj_size(cachep);
+ int size = cachep->object_size;
addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
@@ -1673,7 +1854,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
{
- int size = obj_size(cachep);
+ int size = cachep->object_size;
addr = &((char *)addr)[obj_offset(cachep)];
memset(addr, val, size);
@@ -1683,10 +1864,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
static void dump_line(char *data, int offset, int limit)
{
int i;
- printk(KERN_ERR "%03x:", offset);
- for (i = 0; i < limit; i++)
- printk(" %02x", (unsigned char)data[offset + i]);
- printk("\n");
+ unsigned char error = 0;
+ int bad_count = 0;
+
+ printk(KERN_ERR "%03x: ", offset);
+ for (i = 0; i < limit; i++) {
+ if (data[offset + i] != POISON_FREE) {
+ error = data[offset + i];
+ bad_count++;
+ }
+ }
+ print_hex_dump(KERN_CONT, "", 0, 16, 1,
+ &data[offset], limit, 1);
+
+ if (bad_count == 1) {
+ error ^= POISON_FREE;
+ if (!(error & (error - 1))) {
+ printk(KERN_ERR "Single bit error detected. Probably "
+ "bad RAM.\n");
+#ifdef CONFIG_X86
+ printk(KERN_ERR "Run memtest86+ or a similar memory "
+ "test tool.\n");
+#else
+ printk(KERN_ERR "Run a memory test tool.\n");
+#endif
+ }
+ }
}
#endif
@@ -1698,20 +1901,18 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
char *realobj;
if (cachep->flags & SLAB_RED_ZONE) {
- printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
+ printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
*dbg_redzone1(cachep, objp),
*dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>]",
- *dbg_userword(cachep, objp));
- print_symbol("(%s)",
- (unsigned long)*dbg_userword(cachep, objp));
- printk("\n");
+ printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
+ *dbg_userword(cachep, objp),
+ *dbg_userword(cachep, objp));
}
realobj = (char *)objp + obj_offset(cachep);
- size = obj_size(cachep);
+ size = cachep->object_size;
for (i = 0; i < size && lines; i += 16, lines--) {
int limit;
limit = 16;
@@ -1728,7 +1929,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int lines = 0;
realobj = (char *)objp + obj_offset(cachep);
- size = obj_size(cachep);
+ size = cachep->object_size;
for (i = 0; i < size; i++) {
char exp = POISON_FREE;
@@ -1740,8 +1941,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print header */
if (lines == 0) {
printk(KERN_ERR
- "Slab corruption: start=%p, len=%d\n",
- realobj, size);
+ "Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name, realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1761,19 +1962,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct slab *slabp = virt_to_slab(objp);
+ struct page *page = virt_to_head_page(objp);
unsigned int objnr;
- objnr = obj_to_index(cachep, slabp, objp);
+ objnr = obj_to_index(cachep, page, objp);
if (objnr) {
- objp = index_to_obj(cachep, slabp, objnr - 1);
+ objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = index_to_obj(cachep, slabp, objnr + 1);
+ objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Next obj: start=%p, len=%d\n",
realobj, size);
@@ -1784,26 +1985,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#endif
#if DEBUG
-/**
- * slab_destroy_objs - destroy a slab and its objects
- * @cachep: cache pointer being destroyed
- * @slabp: slab pointer being destroyed
- *
- * Call the registered destructor for each object in a slab that is being
- * destroyed.
- */
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+ struct page *page)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, slabp, i);
+ void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->buffer_size % PAGE_SIZE == 0 &&
+ if (cachep->size % PAGE_SIZE == 0 &&
OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 1);
+ cachep->size / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -1818,88 +2012,54 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
slab_error(cachep, "end of a freed object "
"was overwritten");
}
- if (cachep->dtor && !(cachep->flags & SLAB_POISON))
- (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
}
}
#else
-static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep,
+ struct page *page)
{
- if (cachep->dtor) {
- int i;
- for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, slabp, i);
- (cachep->dtor) (objp, cachep, 0);
- }
- }
}
#endif
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
- * @slabp: slab pointer being destroyed
+ * @page: page pointer being destroyed
*
* Destroy all the objs in a slab, and release the mem back to the system.
* Before calling the slab must have been unlinked from the cache. The
* cache-lock is not held/needed.
*/
-static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy(struct kmem_cache *cachep, struct page *page)
{
- void *addr = slabp->s_mem - slabp->colouroff;
+ void *freelist;
- slab_destroy_objs(cachep, slabp);
+ freelist = page->freelist;
+ slab_destroy_debugcheck(cachep, page);
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
- struct slab_rcu *slab_rcu;
-
- slab_rcu = (struct slab_rcu *)slabp;
- slab_rcu->cachep = cachep;
- slab_rcu->addr = addr;
- call_rcu(&slab_rcu->head, kmem_rcu_free);
- } else {
- kmem_freepages(cachep, addr);
- if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->slabp_cache, slabp);
- }
-}
+ struct rcu_head *head;
-/*
- * For setting up all the kmem_list3s for cache whose buffer_size is same as
- * size of kmem_list3.
- */
-static void set_up_list3s(struct kmem_cache *cachep, int index)
-{
- int node;
+ /*
+ * RCU free overloads the RCU head over the LRU.
+ * slab_page has been overloaded over the LRU,
+ * however it is not used from now on so that
+ * we can use it safely.
+ */
+ head = (void *)&page->rcu_head;
+ call_rcu(head, kmem_rcu_free);
- for_each_online_node(node) {
- cachep->nodelists[node] = &initkmem_list3[index + node];
- cachep->nodelists[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ } else {
+ kmem_freepages(cachep, page);
}
-}
-
-static void __kmem_cache_destroy(struct kmem_cache *cachep)
-{
- int i;
- struct kmem_list3 *l3;
-
- for_each_online_cpu(i)
- kfree(cachep->array[i]);
- /* NUMA: free the list3 structures */
- for_each_online_node(i) {
- l3 = cachep->nodelists[i];
- if (l3) {
- kfree(l3->shared);
- free_alien_cache(l3->alien);
- kfree(l3);
- }
- }
- kmem_cache_free(&cache_cache, cachep);
+ /*
+ * From now on, we don't use freelist
+ * although actual page can be freed in rcu context
+ */
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->freelist_cache, freelist);
}
-
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
@@ -1920,7 +2080,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t left_over = 0;
int gfporder;
- for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
+ for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
@@ -1928,14 +2088,21 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (!num)
continue;
+ /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+ if (num > SLAB_OBJ_MAX_NUM)
+ break;
+
if (flags & CFLGS_OFF_SLAB) {
+ size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
- offslab_limit = size - sizeof(struct slab);
- offslab_limit /= sizeof(kmem_bufctl_t);
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ freelist_size_per_obj += sizeof(char);
+ offslab_limit = size;
+ offslab_limit /= freelist_size_per_obj;
if (num > offslab_limit)
break;
@@ -1958,7 +2125,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
- if (gfporder >= slab_break_gfp_order)
+ if (gfporder >= slab_max_order)
break;
/*
@@ -1970,50 +2137,59 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}
-static int setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
- if (g_cpucache_up == FULL)
- return enable_cpucache(cachep);
+ if (slab_state >= FULL)
+ return enable_cpucache(cachep, gfp);
- if (g_cpucache_up == NONE) {
+ if (slab_state == DOWN) {
+ /*
+ * Note: Creation of first cache (kmem_cache).
+ * The setup_node is taken care
+ * of by the caller of __kmem_cache_create
+ */
+ cachep->array[smp_processor_id()] = &initarray_generic.cache;
+ slab_state = PARTIAL;
+ } else if (slab_state == PARTIAL) {
/*
- * Note: the first kmem_cache_create must create the cache
+ * Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
/*
- * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
- * the first cache, then we need to set up all its list3s,
+ * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
+ * the second cache, then we need to set up all its nodes,
* otherwise the creation of further caches will BUG().
*/
- set_up_list3s(cachep, SIZE_AC);
- if (INDEX_AC == INDEX_L3)
- g_cpucache_up = PARTIAL_L3;
+ set_up_node(cachep, SIZE_AC);
+ if (INDEX_AC == INDEX_NODE)
+ slab_state = PARTIAL_NODE;
else
- g_cpucache_up = PARTIAL_AC;
+ slab_state = PARTIAL_ARRAYCACHE;
} else {
+ /* Remaining boot caches */
cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ kmalloc(sizeof(struct arraycache_init), gfp);
- if (g_cpucache_up == PARTIAL_AC) {
- set_up_list3s(cachep, SIZE_L3);
- g_cpucache_up = PARTIAL_L3;
+ if (slab_state == PARTIAL_ARRAYCACHE) {
+ set_up_node(cachep, SIZE_NODE);
+ slab_state = PARTIAL_NODE;
} else {
int node;
for_each_online_node(node) {
- cachep->nodelists[node] =
- kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node);
- BUG_ON(!cachep->nodelists[node]);
- kmem_list3_init(cachep->nodelists[node]);
+ cachep->node[node] =
+ kmalloc_node(sizeof(struct kmem_cache_node),
+ gfp, node);
+ BUG_ON(!cachep->node[node]);
+ kmem_cache_node_init(cachep->node[node]);
}
}
}
- cachep->nodelists[numa_node_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ cachep->node[numa_mem_id()]->next_reap =
+ jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2025,21 +2201,13 @@ static int setup_cpu_cache(struct kmem_cache *cachep)
}
/**
- * kmem_cache_create - Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
+ * __kmem_cache_create - Create a cache.
+ * @cachep: cache management descriptor
* @flags: SLAB flags
- * @ctor: A constructor for the objects.
- * @dtor: A destructor for the objects.
*
* Returns 0 on success, or an error code (e.g. -E2BIG) on failure.
* Cannot be called within an interrupt, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache
- * and the @dtor is run before the pages are handed back.
- *
- * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting unloaded.
+ * The @ctor is run when new pages are allocated by the cache.
*
* The flags are
*
@@ -2053,67 +2221,15 @@ static int setup_cpu_cache(struct kmem_cache *cachep)
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-struct kmem_cache *
-kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags,
- void (*ctor)(void*, struct kmem_cache *, unsigned long),
- void (*dtor)(void*, struct kmem_cache *, unsigned long))
+int
+__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL, *pc;
-
- /*
- * Sanity checks... these are all serious usage bugs.
- */
- if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
- (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
- printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
- name);
- BUG();
- }
-
- /*
- * Prevent CPUs from coming and going.
- * lock_cpu_hotplug() nests outside cache_chain_mutex
- */
- lock_cpu_hotplug();
-
- mutex_lock(&cache_chain_mutex);
-
- list_for_each_entry(pc, &cache_chain, next) {
- mm_segment_t old_fs = get_fs();
- char tmp;
- int res;
-
- /*
- * This happens when the module gets unloaded and doesn't
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- set_fs(KERNEL_DS);
- res = __get_user(tmp, pc->name);
- set_fs(old_fs);
- if (res) {
- printk("SLAB: cache with size %d has lost its name\n",
- pc->buffer_size);
- continue;
- }
-
- if (!strcmp(pc->name, name)) {
- printk("kmem_cache_create: duplicate cache %s\n", name);
- dump_stack();
- goto oops;
- }
- }
+ size_t left_over, freelist_size, ralign;
+ gfp_t gfp;
+ int err;
+ size_t size = cachep->size;
#if DEBUG
- WARN_ON(strchr(name, ' ')); /* It confuses parsers */
- if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
- /* No constructor, but inital state check requested */
- printk(KERN_ERR "%s: No con, but init state check "
- "requested - %s\n", __FUNCTION__, name);
- flags &= ~SLAB_DEBUG_INITIAL;
- }
#if FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
@@ -2121,7 +2237,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
- if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
+ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
+ 2 * sizeof(unsigned long long)))
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
@@ -2129,14 +2246,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(dtor);
-
- /*
- * Always checks flags, a caller might be expecting debug support which
- * isn't available.
- */
- BUG_ON(flags & ~CREATE_MASK);
/*
* Check that size is in terms of words. This is needed to avoid
@@ -2148,54 +2257,41 @@ kmem_cache_create (const char *name, size_t size, size_t align,
size &= ~(BYTES_PER_WORD - 1);
}
- /* calculate the final buffer alignment: */
-
- /* 1) arch recommendation: can be overridden for debug */
- if (flags & SLAB_HWCACHE_ALIGN) {
- /*
- * Default alignment: as specified by the arch code. Except if
- * an object is really small, then squeeze multiple objects into
- * one cacheline.
- */
- ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- } else {
- ralign = BYTES_PER_WORD;
- }
-
/*
- * Redzoning and user store require word alignment. Note this will be
- * overridden by architecture or caller mandated alignment if either
- * is greater than BYTES_PER_WORD.
+ * Redzoning and user store require word alignment or possibly larger.
+ * Note this will be overridden by architecture or caller mandated
+ * alignment if either is greater than BYTES_PER_WORD.
*/
- if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
+ if (flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
- /* 2) arch mandated alignment: disables debug if necessary */
- if (ralign < ARCH_SLAB_MINALIGN) {
- ralign = ARCH_SLAB_MINALIGN;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+ if (flags & SLAB_RED_ZONE) {
+ ralign = REDZONE_ALIGN;
+ /* If redzoning, ensure that the second redzone is suitably
+ * aligned, by adjusting the object size accordingly. */
+ size += REDZONE_ALIGN - 1;
+ size &= ~(REDZONE_ALIGN - 1);
}
- /* 3) caller mandated alignment: disables debug if necessary */
- if (ralign < align) {
- ralign = align;
- if (ralign > BYTES_PER_WORD)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+
+ /* 3) caller mandated alignment */
+ if (ralign < cachep->align) {
+ ralign = cachep->align;
}
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
- align = ralign;
+ cachep->align = ralign;
- /* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
- if (!cachep)
- goto oops;
+ if (slab_is_available())
+ gfp = GFP_KERNEL;
+ else
+ gfp = GFP_NOWAIT;
+ setup_node_pointer(cachep);
#if DEBUG
- cachep->obj_size = size;
/*
* Both debugging options require word-alignment which is calculated
@@ -2203,19 +2299,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += BYTES_PER_WORD;
- size += 2 * BYTES_PER_WORD;
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
- * the real object.
+ * the real object. But if the second red zone needs to be
+ * aligned to 64 bits, we must allow that much space.
*/
- size += BYTES_PER_WORD;
+ if (flags & SLAB_RED_ZONE)
+ size += REDZONE_ALIGN;
+ else
+ size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ if (size >= kmalloc_size(INDEX_NODE + 1)
+ && cachep->object_size > cache_line_size()
+ && ALIGN(size, cachep->align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
size = PAGE_SIZE;
}
#endif
@@ -2224,87 +2325,99 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
- * it too early on.)
+ * it too early on. Always use on-slab management when
+ * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+ if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
+ !(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
- size = ALIGN(size, align);
+ size = ALIGN(size, cachep->align);
+ /*
+ * We should restrict the number of objects in a slab to implement a
+ * byte-sized index. Refer to the comment on the SLAB_OBJ_MIN_SIZE definition.
+ */
+ if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+ size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, align, flags);
+ left_over = calculate_slab_order(cachep, size, cachep->align, flags);
- if (!cachep->num) {
- printk("kmem_cache_create: couldn't create cache %s.\n", name);
- kmem_cache_free(&cache_cache, cachep);
- cachep = NULL;
- goto oops;
- }
- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+ if (!cachep->num)
+ return -E2BIG;
+
+ freelist_size = calculate_freelist_size(cachep->num, cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+ if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
flags &= ~CFLGS_OFF_SLAB;
- left_over -= slab_size;
+ left_over -= freelist_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- slab_size =
- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+ freelist_size = calculate_freelist_size(cachep->num, 0);
+
+#ifdef CONFIG_PAGE_POISONING
+ /* If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
+ flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
}
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < align)
- cachep->colour_off = align;
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
- cachep->slab_size = slab_size;
+ cachep->freelist_size = freelist_size;
cachep->flags = flags;
- cachep->gfpflags = 0;
- if (flags & SLAB_CACHE_DMA)
- cachep->gfpflags |= GFP_DMA;
- cachep->buffer_size = size;
+ cachep->allocflags = __GFP_COMP;
+ if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+ cachep->allocflags |= GFP_DMA;
+ cachep->size = size;
+ cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
- cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
+ cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
- * This is a possibility for one of the malloc_sizes caches.
+ * This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
- * this should not happen at all.
+ * PAGE_SIZE/32, and kmalloc_{dma,}_caches get created
+ * in ascending order, this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
- BUG_ON(!cachep->slabp_cache);
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
}
- cachep->ctor = ctor;
- cachep->dtor = dtor;
- cachep->name = name;
-
- if (setup_cpu_cache(cachep)) {
- __kmem_cache_destroy(cachep);
- cachep = NULL;
- goto oops;
+
+ err = setup_cpu_cache(cachep, gfp);
+ if (err) {
+ __kmem_cache_shutdown(cachep);
+ return err;
}
- /* cache setup completed, link it into the list */
- list_add(&cachep->next, &cache_chain);
-oops:
- if (!cachep && (flags & SLAB_PANIC))
- panic("kmem_cache_create(): failed to create slab `%s'\n",
- name);
- mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
- return cachep;
+ if (flags & SLAB_DEBUG_OBJECTS) {
+ /*
+ * Would deadlock through slab_destroy()->call_rcu()->
+ * debug_object_activate()->kmem_cache_alloc().
+ */
+ WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
+
+ slab_set_debugobj_lock_classes(cachep);
+ } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes(cachep);
+
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_create);
#if DEBUG
static void check_irq_off(void)
@@ -2321,7 +2434,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+ assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock);
#endif
}
@@ -2329,7 +2442,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&cachep->nodelists[node]->list_lock);
+ assert_spin_locked(&cachep->node[node]->list_lock);
#endif
}
@@ -2340,7 +2453,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
-static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
struct array_cache *ac,
int force, int node);
@@ -2348,33 +2461,33 @@ static void do_drain(void *arg)
{
struct kmem_cache *cachep = arg;
struct array_cache *ac;
- int node = numa_node_id();
+ int node = numa_mem_id();
check_irq_off();
ac = cpu_cache_get(cachep);
- spin_lock(&cachep->nodelists[node]->list_lock);
+ spin_lock(&cachep->node[node]->list_lock);
free_block(cachep, ac->entry, ac->avail, node);
- spin_unlock(&cachep->nodelists[node]->list_lock);
+ spin_unlock(&cachep->node[node]->list_lock);
ac->avail = 0;
}
static void drain_cpu_caches(struct kmem_cache *cachep)
{
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
int node;
- on_each_cpu(do_drain, cachep, 1, 1);
+ on_each_cpu(do_drain, cachep, 1);
check_irq_on();
for_each_online_node(node) {
- l3 = cachep->nodelists[node];
- if (l3 && l3->alien)
- drain_alien_cache(cachep, l3->alien);
+ n = cachep->node[node];
+ if (n && n->alien)
+ drain_alien_cache(cachep, n->alien);
}
for_each_online_node(node) {
- l3 = cachep->nodelists[node];
- if (l3)
- drain_array(cachep, l3, l3->shared, 1, node);
+ n = cachep->node[node];
+ if (n)
+ drain_array(cachep, n, n->shared, 1, node);
}
}
@@ -2385,170 +2498,139 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
* Returns the actual number of slabs released.
*/
static int drain_freelist(struct kmem_cache *cache,
- struct kmem_list3 *l3, int tofree)
+ struct kmem_cache_node *n, int tofree)
{
struct list_head *p;
int nr_freed;
- struct slab *slabp;
+ struct page *page;
nr_freed = 0;
- while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
+ while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&l3->list_lock);
- p = l3->slabs_free.prev;
- if (p == &l3->slabs_free) {
- spin_unlock_irq(&l3->list_lock);
+ spin_lock_irq(&n->list_lock);
+ p = n->slabs_free.prev;
+ if (p == &n->slabs_free) {
+ spin_unlock_irq(&n->list_lock);
goto out;
}
- slabp = list_entry(p, struct slab, list);
+ page = list_entry(p, struct page, lru);
#if DEBUG
- BUG_ON(slabp->inuse);
+ BUG_ON(page->active);
#endif
- list_del(&slabp->list);
+ list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
*/
- l3->free_objects -= cache->num;
- spin_unlock_irq(&l3->list_lock);
- slab_destroy(cache, slabp);
+ n->free_objects -= cache->num;
+ spin_unlock_irq(&n->list_lock);
+ slab_destroy(cache, page);
nr_freed++;
}
out:
return nr_freed;
}
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
drain_cpu_caches(cachep);
check_irq_on();
for_each_online_node(i) {
- l3 = cachep->nodelists[i];
- if (!l3)
+ n = cachep->node[i];
+ if (!n)
continue;
- drain_freelist(cachep, l3, l3->free_objects);
+ drain_freelist(cachep, n, slabs_tofree(cachep, n));
- ret += !list_empty(&l3->slabs_full) ||
- !list_empty(&l3->slabs_partial);
+ ret += !list_empty(&n->slabs_full) ||
+ !list_empty(&n->slabs_partial);
}
return (ret ? 1 : 0);
}
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- BUG_ON(!cachep || in_interrupt());
-
- return __cache_shrink(cachep);
-}
-EXPORT_SYMBOL(kmem_cache_shrink);
+ int i;
+ struct kmem_cache_node *n;
+ int rc = __kmem_cache_shrink(cachep);
-/**
- * kmem_cache_destroy - delete a cache
- * @cachep: the cache to destroy
- *
- * Remove a struct kmem_cache object from the slab cache.
- *
- * It is expected this function will be called by a module when it is
- * unloaded. This will remove the cache completely, and avoid a duplicate
- * cache being allocated each time a module is loaded and unloaded, if the
- * module doesn't have persistent in-kernel storage across loads and unloads.
- *
- * The cache must be empty before calling this function.
- *
- * The caller must guarantee that noone will allocate memory from the cache
- * during the kmem_cache_destroy().
- */
-void kmem_cache_destroy(struct kmem_cache *cachep)
-{
- BUG_ON(!cachep || in_interrupt());
+ if (rc)
+ return rc;
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
+ for_each_online_cpu(i)
+ kfree(cachep->array[i]);
- /* Find the cache in the chain of caches. */
- mutex_lock(&cache_chain_mutex);
- /*
- * the chain is never empty, cache_cache is never destroyed
- */
- list_del(&cachep->next);
- mutex_unlock(&cache_chain_mutex);
-
- if (__cache_shrink(cachep)) {
- slab_error(cachep, "Can't free all objects");
- mutex_lock(&cache_chain_mutex);
- list_add(&cachep->next, &cache_chain);
- mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
- return;
+ /* NUMA: free the node structures */
+ for_each_online_node(i) {
+ n = cachep->node[i];
+ if (n) {
+ kfree(n->shared);
+ free_alien_cache(n->alien);
+ kfree(n);
+ }
}
-
- if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
- synchronize_rcu();
-
- __kmem_cache_destroy(cachep);
- unlock_cpu_hotplug();
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
/*
* Get the memory for a slab management obj.
- * For a slab cache when the slab descriptor is off-slab, slab descriptors
- * always come from malloc_sizes caches. The slab descriptor cannot
- * come from the same cache which is getting created because,
- * when we are searching for an appropriate cache for these
- * descriptors in kmem_cache_create, we search through the malloc_sizes array.
- * If we are creating a malloc_sizes cache here it would not be visible to
- * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have slabp_cache same as the original cache.
+ *
+ * For a slab cache when the slab descriptor is off-slab, the
+ * slab descriptor can't come from the same cache which is being created,
+ * because if that were the case, we would defer the creation of
+ * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+ * And we eventually call down to __kmem_cache_create(), which
+ * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
+ * This is a "chicken-and-egg" problem.
+ *
+ * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+ * which are all initialized during kmem_cache_init().
*/
-static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
- int colour_off, gfp_t local_flags,
- int nodeid)
+static void *alloc_slabmgmt(struct kmem_cache *cachep,
+ struct page *page, int colour_off,
+ gfp_t local_flags, int nodeid)
{
- struct slab *slabp;
+ void *freelist;
+ void *addr = page_address(page);
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- slabp = kmem_cache_alloc_node(cachep->slabp_cache,
+ freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
- if (!slabp)
+ if (!freelist)
return NULL;
} else {
- slabp = objp + colour_off;
- colour_off += cachep->slab_size;
+ freelist = addr + colour_off;
+ colour_off += cachep->freelist_size;
}
- slabp->inuse = 0;
- slabp->colouroff = colour_off;
- slabp->s_mem = objp + colour_off;
- slabp->nodeid = nodeid;
- return slabp;
+ page->active = 0;
+ page->s_mem = addr + colour_off;
+ return freelist;
+}
+
+static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
+{
+ return ((freelist_idx_t *)page->freelist)[idx];
}
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+static inline void set_free_obj(struct page *page,
+ unsigned int idx, freelist_idx_t val)
{
- return (kmem_bufctl_t *) (slabp + 1);
+ ((freelist_idx_t *)(page->freelist))[idx] = val;
}
static void cache_init_objs(struct kmem_cache *cachep,
- struct slab *slabp, unsigned long ctor_flags)
+ struct page *page)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, slabp, i);
+ void *objp = index_to_obj(cachep, page, i);
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2566,8 +2648,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
* They must also be threaded.
*/
if (cachep->ctor && !(cachep->flags & SLAB_POISON))
- cachep->ctor(objp + obj_offset(cachep), cachep,
- ctor_flags);
+ cachep->ctor(objp + obj_offset(cachep));
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2577,130 +2658,108 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+ if ((cachep->size % PAGE_SIZE) == 0 &&
OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 0);
+ cachep->size / PAGE_SIZE, 0);
#else
if (cachep->ctor)
- cachep->ctor(objp, cachep, ctor_flags);
+ cachep->ctor(objp);
#endif
- slab_bufctl(slabp)[i] = i + 1;
+ set_obj_status(page, i, OBJECT_FREE);
+ set_free_obj(page, i, i);
}
- slab_bufctl(slabp)[i - 1] = BUFCTL_END;
- slabp->free = 0;
}
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
- if (flags & SLAB_DMA)
- BUG_ON(!(cachep->gfpflags & GFP_DMA));
- else
- BUG_ON(cachep->gfpflags & GFP_DMA);
+ if (CONFIG_ZONE_DMA_FLAG) {
+ if (flags & GFP_DMA)
+ BUG_ON(!(cachep->allocflags & GFP_DMA));
+ else
+ BUG_ON(cachep->allocflags & GFP_DMA);
+ }
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
int nodeid)
{
- void *objp = index_to_obj(cachep, slabp, slabp->free);
- kmem_bufctl_t next;
+ void *objp;
- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
+ objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
+ page->active++;
#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
- WARN_ON(slabp->nodeid != nodeid);
+ WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
#endif
- slabp->free = next;
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
void *objp, int nodeid)
{
- unsigned int objnr = obj_to_index(cachep, slabp, objp);
-
+ unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
- /* Verify that the slab belongs to the intended node */
- WARN_ON(slabp->nodeid != nodeid);
+ unsigned int i;
- if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
- BUG();
+ /* Verify that the slab belongs to the intended node */
+ WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+
+ /* Verify double free bug */
+ for (i = page->active; i < cachep->num; i++) {
+ if (get_free_obj(page, i) == objnr) {
+ printk(KERN_ERR "slab: double free detected in cache "
+ "'%s', objp %p\n", cachep->name, objp);
+ BUG();
+ }
}
#endif
- slab_bufctl(slabp)[objnr] = slabp->free;
- slabp->free = objnr;
- slabp->inuse--;
+ page->active--;
+ set_free_obj(page, page->active, objnr);
}
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
-static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
- void *addr)
+static void slab_map_pages(struct kmem_cache *cache, struct page *page,
+ void *freelist)
{
- int nr_pages;
- struct page *page;
-
- page = virt_to_page(addr);
-
- nr_pages = 1;
- if (likely(!PageCompound(page)))
- nr_pages <<= cache->gfporder;
-
- do {
- page_set_cache(page, cache);
- page_set_slab(page, slab);
- page++;
- } while (--nr_pages);
+ page->slab_cache = cache;
+ page->freelist = freelist;
}
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+ gfp_t flags, int nodeid, struct page *page)
{
- struct slab *slabp;
- void *objp;
+ void *freelist;
size_t offset;
gfp_t local_flags;
- unsigned long ctor_flags;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
/*
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
- if (flags & SLAB_NO_GROW)
- return 0;
-
- ctor_flags = SLAB_CTOR_CONSTRUCTOR;
- local_flags = (flags & SLAB_LEVEL_MASK);
- if (!(local_flags & __GFP_WAIT))
- /*
- * Not allowed to sleep. Need to tell a constructor about
- * this - it might need to know...
- */
- ctor_flags |= SLAB_CTOR_ATOMIC;
+ BUG_ON(flags & GFP_SLAB_BUG_MASK);
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
- /* Take the l3 list lock to change the colour_next on this node */
+ /* Take the node list lock to change the colour_next on this node */
check_irq_off();
- l3 = cachep->nodelists[nodeid];
- spin_lock(&l3->list_lock);
+ n = cachep->node[nodeid];
+ spin_lock(&n->list_lock);
/* Get colour for the slab, and calculate the next value. */
- offset = l3->colour_next;
- l3->colour_next++;
- if (l3->colour_next >= cachep->colour)
- l3->colour_next = 0;
- spin_unlock(&l3->list_lock);
+ offset = n->colour_next;
+ n->colour_next++;
+ if (n->colour_next >= cachep->colour)
+ n->colour_next = 0;
+ spin_unlock(&n->list_lock);
offset *= cachep->colour_off;
@@ -2719,33 +2778,34 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- objp = kmem_getpages(cachep, flags, nodeid);
- if (!objp)
+ if (!page)
+ page = kmem_getpages(cachep, local_flags, nodeid);
+ if (!page)
goto failed;
/* Get slab management. */
- slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
- if (!slabp)
+ freelist = alloc_slabmgmt(cachep, page, offset,
+ local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+ if (!freelist)
goto opps1;
- slabp->nodeid = nodeid;
- slab_map_pages(cachep, slabp, objp);
+ slab_map_pages(cachep, page, freelist);
- cache_init_objs(cachep, slabp, ctor_flags);
+ cache_init_objs(cachep, page);
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- spin_lock(&l3->list_lock);
+ spin_lock(&n->list_lock);
/* Make slab active. */
- list_add_tail(&slabp->list, &(l3->slabs_free));
+ list_add_tail(&page->lru, &(n->slabs_free));
STATS_INC_GROWN(cachep);
- l3->free_objects += cachep->num;
- spin_unlock(&l3->list_lock);
+ n->free_objects += cachep->num;
+ spin_unlock(&n->list_lock);
return 1;
opps1:
- kmem_freepages(cachep, objp);
+ kmem_freepages(cachep, page);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
@@ -2758,28 +2818,19 @@ failed:
* Perform extra freeing checks:
* - detect bad pointers.
* - POISON/RED_ZONE checking
- * - destructor calls, for caches with POISON+dtor
*/
static void kfree_debugcheck(const void *objp)
{
- struct page *page;
-
if (!virt_addr_valid(objp)) {
printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
(unsigned long)objp);
BUG();
}
- page = virt_to_page(objp);
- if (!PageSlab(page)) {
- printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
- (unsigned long)objp);
- BUG();
- }
}
static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
{
- unsigned long redzone1, redzone2;
+ unsigned long long redzone1, redzone2;
redzone1 = *dbg_redzone1(cache, obj);
redzone2 = *dbg_redzone2(cache, obj);
@@ -2795,22 +2846,21 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
+ printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
obj, redzone1, redzone2);
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
- void *caller)
+ unsigned long caller)
{
- struct page *page;
unsigned int objnr;
- struct slab *slabp;
+ struct page *page;
+
+ BUG_ON(virt_to_cache(objp) != cachep);
objp -= obj_offset(cachep);
kfree_debugcheck(objp);
- page = virt_to_page(objp);
-
- slabp = page_get_slab(page);
+ page = virt_to_head_page(objp);
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
@@ -2818,37 +2868,20 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
- objnr = obj_to_index(cachep, slabp, objp);
+ objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
+ BUG_ON(objp != index_to_obj(cachep, page, objnr));
- if (cachep->flags & SLAB_DEBUG_INITIAL) {
- /*
- * Need to call the slab's constructor so the caller can
- * perform a verify of its state (debugging). Called without
- * the cache-lock held.
- */
- cachep->ctor(objp + obj_offset(cachep),
- cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
- }
- if (cachep->flags & SLAB_POISON && cachep->dtor) {
- /* we want to cache poison the object,
- * call the destruction callback
- */
- cachep->dtor(objp + obj_offset(cachep), cachep, 0);
- }
-#ifdef CONFIG_DEBUG_SLAB_LEAK
- slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
-#endif
+ set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, (unsigned long)caller);
+ if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
+ store_stackinfo(cachep, objp, caller);
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 0);
+ cachep->size / PAGE_SIZE, 0);
} else {
poison_obj(cachep, objp, POISON_FREE);
}
@@ -2859,48 +2892,25 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
return objp;
}
-static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
-{
- kmem_bufctl_t i;
- int entries = 0;
-
- /* Check slab's freelist to see if this obj is there. */
- for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
- entries++;
- if (entries > cachep->num || i >= cachep->num)
- goto bad;
- }
- if (entries != cachep->num - slabp->inuse) {
-bad:
- printk(KERN_ERR "slab: Internal list corruption detected in "
- "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
- cachep->name, cachep->num, slabp, slabp->inuse);
- for (i = 0;
- i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
- i++) {
- if (i % 16 == 0)
- printk("\n%03x:", i);
- printk(" %02x", ((unsigned char *)slabp)[i]);
- }
- printk("\n");
- BUG();
- }
-}
#else
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x,objp,z) (objp)
-#define check_slabp(x,y) do { } while(0)
#endif
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
+ bool force_refill)
{
int batchcount;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
struct array_cache *ac;
+ int node;
check_irq_off();
- ac = cpu_cache_get(cachep);
+ node = numa_mem_id();
+ if (unlikely(force_refill))
+ goto force_grow;
retry:
+ ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
@@ -2910,67 +2920,80 @@ retry:
*/
batchcount = BATCHREFILL_LIMIT;
}
- l3 = cachep->nodelists[numa_node_id()];
+ n = cachep->node[node];
- BUG_ON(ac->avail > 0 || !l3);
- spin_lock(&l3->list_lock);
+ BUG_ON(ac->avail > 0 || !n);
+ spin_lock(&n->list_lock);
/* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+ if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
+ n->shared->touched = 1;
goto alloc_done;
+ }
while (batchcount > 0) {
struct list_head *entry;
- struct slab *slabp;
+ struct page *page;
/* Get slab alloc is to come from. */
- entry = l3->slabs_partial.next;
- if (entry == &l3->slabs_partial) {
- l3->free_touched = 1;
- entry = l3->slabs_free.next;
- if (entry == &l3->slabs_free)
+ entry = n->slabs_partial.next;
+ if (entry == &n->slabs_partial) {
+ n->free_touched = 1;
+ entry = n->slabs_free.next;
+ if (entry == &n->slabs_free)
goto must_grow;
}
- slabp = list_entry(entry, struct slab, list);
- check_slabp(cachep, slabp);
+ page = list_entry(entry, struct page, lru);
check_spinlock_acquired(cachep);
- while (slabp->inuse < cachep->num && batchcount--) {
+
+ /*
+ * The slab was either on partial or free list so
+ * there must be at least one object available for
+ * allocation.
+ */
+ BUG_ON(page->active >= cachep->num);
+
+ while (page->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
- numa_node_id());
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
+ node));
}
- check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
- list_del(&slabp->list);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &l3->slabs_full);
+ list_del(&page->lru);
+ if (page->active == cachep->num)
+ list_add(&page->lru, &n->slabs_full);
else
- list_add(&slabp->list, &l3->slabs_partial);
+ list_add(&page->lru, &n->slabs_partial);
}
must_grow:
- l3->free_objects -= ac->avail;
+ n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&l3->list_lock);
+ spin_unlock(&n->list_lock);
if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, numa_node_id());
+force_grow:
+ x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
- if (!x && ac->avail == 0) /* no objects in sight? abort */
+ node = numa_mem_id();
+
+ /* no objects in sight? abort */
+ if (!x && (ac->avail == 0 || force_refill))
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
- return ac->entry[--ac->avail];
+
+ return ac_get_obj(cachep, ac, flags, force_refill);
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2984,15 +3007,17 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
- gfp_t flags, void *objp, void *caller)
+ gfp_t flags, void *objp, unsigned long caller)
{
+ struct page *page;
+
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+ if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 1);
+ cachep->size / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -3001,7 +3026,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3009,31 +3034,23 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
slab_error(cachep, "double free, or memory outside"
" object was overwritten");
printk(KERN_ERR
- "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
+ "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
objp, *dbg_redzone1(cachep, objp),
*dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
-#ifdef CONFIG_DEBUG_SLAB_LEAK
- {
- struct slab *slabp;
- unsigned objnr;
- slabp = page_get_slab(virt_to_page(objp));
- objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
- slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
- }
-#endif
+ page = virt_to_head_page(objp);
+ set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
- if (cachep->ctor && cachep->flags & SLAB_POISON) {
- unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-
- if (!(flags & __GFP_WAIT))
- ctor_flags |= SLAB_CTOR_ATOMIC;
-
- cachep->ctor(objp, cachep, ctor_flags);
+ if (cachep->ctor && cachep->flags & SLAB_POISON)
+ cachep->ctor(objp);
+ if (ARCH_SLAB_MINALIGN &&
+ ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
+ printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
}
@@ -3041,56 +3058,60 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
+static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+ if (cachep == kmem_cache)
+ return false;
+
+ return should_failslab(cachep->object_size, flags, cachep->flags);
+}
+
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
+ bool force_refill = false;
check_irq_off();
+
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
- STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac->entry[--ac->avail];
- } else {
- STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags);
- }
- return objp;
-}
+ objp = ac_get_obj(cachep, ac, flags, false);
-static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
- gfp_t flags, void *caller)
-{
- unsigned long save_flags;
- void *objp = NULL;
-
- cache_alloc_debugcheck_before(cachep, flags);
-
- local_irq_save(save_flags);
+ /*
+ * Allow for the possibility all avail objects are not allowed
+ * by the current flags
+ */
+ if (objp) {
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
+ }
+ force_refill = true;
+ }
- if (unlikely(NUMA_BUILD &&
- current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
- objp = alternate_node_alloc(cachep, flags);
+ STATS_INC_ALLOCMISS(cachep);
+ objp = cache_alloc_refill(cachep, flags, force_refill);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
- if (!objp)
- objp = ____cache_alloc(cachep, flags);
+out:
/*
- * We may just have run out of memory on the local node.
- * __cache_alloc_node() knows how to locate memory on other nodes
+ * To avoid a false negative, if an object that is in one of the
+ * per-CPU caches is leaked, we need to make sure kmemleak doesn't
+ * treat the array pointers as a reference to the object.
*/
- if (NUMA_BUILD && !objp)
- objp = __cache_alloc_node(cachep, flags, numa_node_id());
- local_irq_restore(save_flags);
- objp = cache_alloc_debugcheck_after(cachep, flags, objp,
- caller);
- prefetchw(objp);
+ if (objp)
+ kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3101,148 +3122,331 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
- nid_alloc = nid_here = numa_node_id();
+ nid_alloc = nid_here = numa_mem_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
- nid_alloc = cpuset_mem_spread_node();
+ nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = slab_node(current->mempolicy);
+ nid_alloc = mempolicy_slab_node();
if (nid_alloc != nid_here)
- return __cache_alloc_node(cachep, flags, nid_alloc);
+ return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
}
/*
* Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available node for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
*/
-void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
{
- struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
- ->node_zonelists[gfp_zone(flags)];
- struct zone **z;
+ struct zonelist *zonelist;
+ gfp_t local_flags;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
+ int nid;
+ unsigned int cpuset_mems_cookie;
+
+ if (flags & __GFP_THISNODE)
+ return NULL;
+
+ local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
- for (z = zonelist->zones; *z && !obj; z++)
- if (zone_idx(*z) <= ZONE_NORMAL &&
- cpuset_zone_allowed(*z, flags))
- obj = __cache_alloc_node(cache,
- flags | __GFP_THISNODE,
- zone_to_nid(*z));
+retry:
+ /*
+ * Look through allowed nodes for objects available
+ * from existing per node queues.
+ */
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ nid = zone_to_nid(zone);
+
+ if (cpuset_zone_allowed_hardwall(zone, flags) &&
+ cache->node[nid] &&
+ cache->node[nid]->free_objects) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (obj)
+ break;
+ }
+ }
+
+ if (!obj) {
+ /*
+ * This allocation will be performed within the constraints
+ * of the current cpuset / memory policy requirements.
+ * We may trigger various forms of reclaim on the allowed
+ * set and go into memory reserves if necessary.
+ */
+ struct page *page;
+
+ if (local_flags & __GFP_WAIT)
+ local_irq_enable();
+ kmem_flagcheck(cache, flags);
+ page = kmem_getpages(cache, local_flags, numa_mem_id());
+ if (local_flags & __GFP_WAIT)
+ local_irq_disable();
+ if (page) {
+ /*
+ * Insert into the appropriate per node queues
+ */
+ nid = page_to_nid(page);
+ if (cache_grow(cache, flags, nid, page)) {
+ obj = ____cache_alloc_node(cache,
+ flags | GFP_THISNODE, nid);
+ if (!obj)
+ /*
+ * Another processor may allocate the
+ * objects in the slab since we are
+ * not holding any locks.
+ */
+ goto retry;
+ } else {
+ /* cache_grow already freed obj */
+ obj = NULL;
+ }
+ }
+ }
+
+ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
return obj;
}
/*
* A interface to enable slab creation on nodeid
*/
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct list_head *entry;
- struct slab *slabp;
- struct kmem_list3 *l3;
+ struct page *page;
+ struct kmem_cache_node *n;
void *obj;
int x;
- l3 = cachep->nodelists[nodeid];
- BUG_ON(!l3);
+ VM_BUG_ON(nodeid > num_online_nodes());
+ n = cachep->node[nodeid];
+ BUG_ON(!n);
retry:
check_irq_off();
- spin_lock(&l3->list_lock);
- entry = l3->slabs_partial.next;
- if (entry == &l3->slabs_partial) {
- l3->free_touched = 1;
- entry = l3->slabs_free.next;
- if (entry == &l3->slabs_free)
+ spin_lock(&n->list_lock);
+ entry = n->slabs_partial.next;
+ if (entry == &n->slabs_partial) {
+ n->free_touched = 1;
+ entry = n->slabs_free.next;
+ if (entry == &n->slabs_free)
goto must_grow;
}
- slabp = list_entry(entry, struct slab, list);
+ page = list_entry(entry, struct page, lru);
check_spinlock_acquired_node(cachep, nodeid);
- check_slabp(cachep, slabp);
STATS_INC_NODEALLOCS(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- BUG_ON(slabp->inuse == cachep->num);
+ BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, slabp, nodeid);
- check_slabp(cachep, slabp);
- l3->free_objects--;
+ obj = slab_get_obj(cachep, page, nodeid);
+ n->free_objects--;
/* move slabp to correct slabp list: */
- list_del(&slabp->list);
+ list_del(&page->lru);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &l3->slabs_full);
+ if (page->active == cachep->num)
+ list_add(&page->lru, &n->slabs_full);
else
- list_add(&slabp->list, &l3->slabs_partial);
+ list_add(&page->lru, &n->slabs_partial);
- spin_unlock(&l3->list_lock);
+ spin_unlock(&n->list_lock);
goto done;
must_grow:
- spin_unlock(&l3->list_lock);
- x = cache_grow(cachep, flags, nodeid);
+ spin_unlock(&n->list_lock);
+ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
if (x)
goto retry;
- if (!(flags & __GFP_THISNODE))
- /* Unable to grow the cache. Fall back to other nodes. */
- return fallback_alloc(cachep, flags);
-
- return NULL;
+ return fallback_alloc(cachep, flags);
done:
return obj;
}
-#endif
+
+static __always_inline void *
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ unsigned long caller)
+{
+ unsigned long save_flags;
+ void *ptr;
+ int slab_node = numa_mem_id();
+
+ flags &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(flags);
+
+ if (slab_should_failslab(cachep, flags))
+ return NULL;
+
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+
+ if (nodeid == NUMA_NO_NODE)
+ nodeid = slab_node;
+
+ if (unlikely(!cachep->node[nodeid])) {
+ /* Node not bootstrapped yet */
+ ptr = fallback_alloc(cachep, flags);
+ goto out;
+ }
+
+ if (nodeid == slab_node) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ if (ptr)
+ goto out;
+ }
+ /* ___cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+ out:
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+ kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
+ flags);
+
+ if (likely(ptr)) {
+ kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(ptr, 0, cachep->object_size);
+ }
+
+ return ptr;
+}
+
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+ void *objp;
+
+ if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
+ objp = alternate_node_alloc(cache, flags);
+ if (objp)
+ goto out;
+ }
+ objp = ____cache_alloc(cache, flags);
+
+ /*
+ * We may just have run out of memory on the local node.
+ * ____cache_alloc_node() knows how to locate memory on other nodes
+ */
+ if (!objp)
+ objp = ____cache_alloc_node(cache, flags, numa_mem_id());
+
+ out:
+ return objp;
+}
+#else
+
+static __always_inline void *
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+ return ____cache_alloc(cachep, flags);
+}
+
+#endif /* CONFIG_NUMA */
+
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+{
+ unsigned long save_flags;
+ void *objp;
+
+ flags &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(flags);
+
+ if (slab_should_failslab(cachep, flags))
+ return NULL;
+
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+ objp = __do_cache_alloc(cachep, flags);
+ local_irq_restore(save_flags);
+ objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
+ kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
+ flags);
+ prefetchw(objp);
+
+ if (likely(objp)) {
+ kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(objp, 0, cachep->object_size);
+ }
+
+ return objp;
+}
/*
- * Caller needs to acquire correct kmem_list's list_lock
+ * Caller needs to acquire correct kmem_cache_node's list_lock
*/
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
int node)
{
int i;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
for (i = 0; i < nr_objects; i++) {
- void *objp = objpp[i];
- struct slab *slabp;
+ void *objp;
+ struct page *page;
+
+ clear_obj_pfmemalloc(&objpp[i]);
+ objp = objpp[i];
- slabp = virt_to_slab(objp);
- l3 = cachep->nodelists[node];
- list_del(&slabp->list);
+ page = virt_to_head_page(objp);
+ n = cachep->node[node];
+ list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- check_slabp(cachep, slabp);
- slab_put_obj(cachep, slabp, objp, node);
+ slab_put_obj(cachep, page, objp, node);
STATS_DEC_ACTIVE(cachep);
- l3->free_objects++;
- check_slabp(cachep, slabp);
+ n->free_objects++;
/* fixup slab chains */
- if (slabp->inuse == 0) {
- if (l3->free_objects > l3->free_limit) {
- l3->free_objects -= cachep->num;
+ if (page->active == 0) {
+ if (n->free_objects > n->free_limit) {
+ n->free_objects -= cachep->num;
/* No need to drop any previously held
* lock here, even if we have a off-slab slab
* descriptor it is guaranteed to come from
* a different cache, refer to comments before
* alloc_slabmgmt.
*/
- slab_destroy(cachep, slabp);
+ slab_destroy(cachep, page);
} else {
- list_add(&slabp->list, &l3->slabs_free);
+ list_add(&page->lru, &n->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&slabp->list, &l3->slabs_partial);
+ list_add_tail(&page->lru, &n->slabs_partial);
}
}
}
@@ -3250,18 +3454,18 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
- struct kmem_list3 *l3;
- int node = numa_node_id();
+ struct kmem_cache_node *n;
+ int node = numa_mem_id();
batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- l3 = cachep->nodelists[node];
- spin_lock(&l3->list_lock);
- if (l3->shared) {
- struct array_cache *shared_array = l3->shared;
+ n = cachep->node[node];
+ spin_lock(&n->list_lock);
+ if (n->shared) {
+ struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
@@ -3280,12 +3484,12 @@ free_done:
int i = 0;
struct list_head *p;
- p = l3->slabs_free.next;
- while (p != &(l3->slabs_free)) {
- struct slab *slabp;
+ p = n->slabs_free.next;
+ while (p != &(n->slabs_free)) {
+ struct page *page;
- slabp = list_entry(p, struct slab, list);
- BUG_ON(slabp->inuse);
+ page = list_entry(p, struct page, lru);
+ BUG_ON(page->active);
i++;
p = p->next;
@@ -3293,7 +3497,7 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&l3->list_lock);
+ spin_unlock(&n->list_lock);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
@@ -3302,25 +3506,35 @@ free_done:
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
- objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+ kmemleak_free_recursive(objp, cachep->flags);
+ objp = cache_free_debugcheck(cachep, objp, caller);
- if (cache_free_alien(cachep, objp))
+ kmemcheck_slab_free(cachep, objp, cachep->object_size);
+
+ /*
+ * Skip calling cache_free_alien() when the platform is not numa.
+ * This will avoid cache misses that happen while accessing slabp (which
+ * is per page memory reference) to get nodeid. Instead use a global
+ * variable to skip the call, which is mostly likely to be present in
+ * the cache.
+ */
+ if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac->entry[ac->avail++] = objp;
- return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac->entry[ac->avail++] = objp;
}
+
+ ac_put_obj(cachep, ac, objp);
}
/**
@@ -3333,68 +3547,29 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
+ void *ret = slab_alloc(cachep, flags, _RET_IP_);
+
+ trace_kmem_cache_alloc(_RET_IP_, ret,
+ cachep->object_size, cachep->size, flags);
-/**
- * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
- * @cache: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache and set the allocated memory to zero.
- * The flags are only relevant if the cache has no available objects.
- */
-void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
- if (ret)
- memset(ret, 0, obj_size(cache));
return ret;
}
-EXPORT_SYMBOL(kmem_cache_zalloc);
+EXPORT_SYMBOL(kmem_cache_alloc);
-/**
- * kmem_ptr_validate - check if an untrusted pointer might
- * be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane:
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
+#ifdef CONFIG_TRACING
+void *
+kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = BYTES_PER_WORD - 1;
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ void *ret;
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ ret = slab_alloc(cachep, flags, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret,
+ size, cachep->size, flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
/**
@@ -3403,45 +3578,73 @@ out:
* @flags: See kmalloc().
* @nodeid: node number of the target node.
*
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- unsigned long save_flags;
- void *ptr;
+ void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
+ trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ cachep->object_size, cachep->size,
+ flags, nodeid);
- if (nodeid == -1 || nodeid == numa_node_id() ||
- !cachep->nodelists[nodeid])
- ptr = ____cache_alloc(cachep, flags);
- else
- ptr = __cache_alloc_node(cachep, flags, nodeid);
- local_irq_restore(save_flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
- __builtin_return_address(0));
+#ifdef CONFIG_TRACING
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid,
+ size_t size)
+{
+ void *ret;
- return ptr;
+ ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, cachep->size,
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
- cachep = kmem_find_general_cachep(size, flags);
- if (unlikely(cachep == NULL))
- return NULL;
- return kmem_cache_alloc_node(cachep, flags, node);
+ cachep = kmalloc_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
+ return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+}
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, 0);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
+#endif /* CONFIG_NUMA */
/**
* __do_kmalloc - allocate memory
@@ -3450,38 +3653,42 @@ EXPORT_SYMBOL(__kmalloc_node);
* @caller: function caller for debug tracking of the caller
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- void *caller)
+ unsigned long caller)
{
struct kmem_cache *cachep;
+ void *ret;
- /* If you want to save a few bytes .text space: replace
- * __ with kmem_.
- * Then kmalloc uses the uninlined functions instead of the inline
- * functions.
- */
- cachep = __find_general_cachep(size, flags);
- if (unlikely(cachep == NULL))
- return NULL;
- return __cache_alloc(cachep, flags, caller);
+ cachep = kmalloc_slab(size, flags);
+ if (unlikely(ZERO_OR_NULL_PTR(cachep)))
+ return cachep;
+ ret = slab_alloc(cachep, flags, caller);
+
+ trace_kmalloc(caller, ret,
+ size, cachep->size, flags);
+
+ return ret;
}
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
-#ifndef CONFIG_DEBUG_SLAB
- return __do_kmalloc(size, flags, NULL);
-#else
- return __do_kmalloc(size, flags, __builtin_return_address(0));
-#endif
+ return __do_kmalloc(size, flags, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);
-#ifdef CONFIG_DEBUG_SLAB
-void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
{
return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#else
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc(size, flags, 0);
+}
+EXPORT_SYMBOL(__kmalloc);
#endif
/**
@@ -3495,12 +3702,18 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
-
- BUG_ON(virt_to_cache(objp) != cachep);
+ cachep = cache_from_obj(cachep, objp);
+ if (!cachep)
+ return;
local_irq_save(flags);
- __cache_free(cachep, objp);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
+
+ trace_kmem_cache_free(_RET_IP_, objp);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3518,105 +3731,102 @@ void kfree(const void *objp)
struct kmem_cache *c;
unsigned long flags;
- if (unlikely(!objp))
+ trace_kfree(_RET_IP_, objp);
+
+ if (unlikely(ZERO_OR_NULL_PTR(objp)))
return;
local_irq_save(flags);
kfree_debugcheck(objp);
c = virt_to_cache(objp);
- debug_check_no_locks_freed(objp, obj_size(c));
- __cache_free(c, (void *)objp);
+ debug_check_no_locks_freed(objp, c->object_size);
+
+ debug_check_no_obj_freed(objp, c->object_size);
+ __cache_free(c, (void *)objp, _RET_IP_);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
-unsigned int kmem_cache_size(struct kmem_cache *cachep)
-{
- return obj_size(cachep);
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
-const char *kmem_cache_name(struct kmem_cache *cachep)
-{
- return cachep->name;
-}
-EXPORT_SYMBOL_GPL(kmem_cache_name);
-
/*
- * This initializes kmem_list3 or resizes varioius caches for all nodes.
+ * This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
struct array_cache *new_shared;
- struct array_cache **new_alien;
+ struct array_cache **new_alien = NULL;
for_each_online_node(node) {
- new_alien = alloc_alien_cache(node, cachep->limit);
- if (!new_alien)
- goto fail;
+ if (use_alien_caches) {
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+ if (!new_alien)
+ goto fail;
+ }
- new_shared = alloc_arraycache(node,
+ new_shared = NULL;
+ if (cachep->shared) {
+ new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
- 0xbaadf00d);
- if (!new_shared) {
- free_alien_cache(new_alien);
- goto fail;
+ 0xbaadf00d, gfp);
+ if (!new_shared) {
+ free_alien_cache(new_alien);
+ goto fail;
+ }
}
- l3 = cachep->nodelists[node];
- if (l3) {
- struct array_cache *shared = l3->shared;
+ n = cachep->node[node];
+ if (n) {
+ struct array_cache *shared = n->shared;
- spin_lock_irq(&l3->list_lock);
+ spin_lock_irq(&n->list_lock);
if (shared)
free_block(cachep, shared->entry,
shared->avail, node);
- l3->shared = new_shared;
- if (!l3->alien) {
- l3->alien = new_alien;
+ n->shared = new_shared;
+ if (!n->alien) {
+ n->alien = new_alien;
new_alien = NULL;
}
- l3->free_limit = (1 + nr_cpus_node(node)) *
+ n->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
- spin_unlock_irq(&l3->list_lock);
+ spin_unlock_irq(&n->list_lock);
kfree(shared);
free_alien_cache(new_alien);
continue;
}
- l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
- if (!l3) {
+ n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+ if (!n) {
free_alien_cache(new_alien);
kfree(new_shared);
goto fail;
}
- kmem_list3_init(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
- l3->shared = new_shared;
- l3->alien = new_alien;
- l3->free_limit = (1 + nr_cpus_node(node)) *
+ kmem_cache_node_init(n);
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+ n->shared = new_shared;
+ n->alien = new_alien;
+ n->free_limit = (1 + nr_cpus_node(node)) *
cachep->batchcount + cachep->num;
- cachep->nodelists[node] = l3;
+ cachep->node[node] = n;
}
return 0;
fail:
- if (!cachep->next.next) {
+ if (!cachep->list.next) {
/* Cache is not active yet. Roll back what we did */
node--;
while (node >= 0) {
- if (cachep->nodelists[node]) {
- l3 = cachep->nodelists[node];
+ if (cachep->node[node]) {
+ n = cachep->node[node];
- kfree(l3->shared);
- free_alien_cache(l3->alien);
- kfree(l3);
- cachep->nodelists[node] = NULL;
+ kfree(n->shared);
+ free_alien_cache(n->alien);
+ kfree(n);
+ cachep->node[node] = NULL;
}
node--;
}
@@ -3626,7 +3836,7 @@ fail:
struct ccupdate_struct {
struct kmem_cache *cachep;
- struct array_cache *new[NR_CPUS];
+ struct array_cache *new[0];
};
static void do_ccupdate_local(void *info)
@@ -3641,20 +3851,21 @@ static void do_ccupdate_local(void *info)
new->new[smp_processor_id()] = old;
}
-/* Always called with the cache_chain_mutex held */
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared)
+/* Always called with the slab_mutex held */
+static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
int i;
- new = kzalloc(sizeof(*new), GFP_KERNEL);
+ new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
+ gfp);
if (!new)
return -ENOMEM;
for_each_online_cpu(i) {
- new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
- batchcount);
+ new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
+ batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
kfree(new->new[i]);
@@ -3664,7 +3875,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
}
new->cachep = cachep;
- on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
+ on_each_cpu(do_ccupdate_local, (void *)new, 1);
check_irq_on();
cachep->batchcount = batchcount;
@@ -3675,21 +3886,58 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
struct array_cache *ccold = new->new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
- spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+ spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep);
+ return alloc_kmem_cache_node(cachep, gfp);
}
-/* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
+{
+ int ret;
+ struct kmem_cache *c = NULL;
+ int i = 0;
+
+ ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+
+ if (slab_state < FULL)
+ return ret;
+
+ if ((ret < 0) || !is_root_cache(cachep))
+ return ret;
+
+ VM_BUG_ON(!mutex_is_locked(&slab_mutex));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(cachep, i);
+ if (c)
+ /* return value determined by the parent cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ }
+
+ return ret;
+}
+
+/* Called with slab_mutex held always */
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
- int limit, shared;
+ int limit = 0;
+ int shared = 0;
+ int batchcount = 0;
+
+ if (!is_root_cache(cachep)) {
+ struct kmem_cache *root = memcg_root_cache(cachep);
+ limit = root->limit;
+ shared = root->shared;
+ batchcount = root->batchcount;
+ }
+ if (limit && shared && batchcount)
+ goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -3699,13 +3947,13 @@ static int enable_cpucache(struct kmem_cache *cachep)
* The numbers are guessed, we should auto-tune as described by
* Bonwick.
*/
- if (cachep->buffer_size > 131072)
+ if (cachep->size > 131072)
limit = 1;
- else if (cachep->buffer_size > PAGE_SIZE)
+ else if (cachep->size > PAGE_SIZE)
limit = 8;
- else if (cachep->buffer_size > 1024)
+ else if (cachep->size > 1024)
limit = 24;
- else if (cachep->buffer_size > 256)
+ else if (cachep->size > 256)
limit = 54;
else
limit = 120;
@@ -3720,10 +3968,8 @@ static int enable_cpucache(struct kmem_cache *cachep)
* to a larger limit. Thus disabled by default.
*/
shared = 0;
-#ifdef CONFIG_SMP
- if (cachep->buffer_size <= PAGE_SIZE)
+ if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
shared = 8;
-#endif
#if DEBUG
/*
@@ -3733,7 +3979,9 @@ static int enable_cpucache(struct kmem_cache *cachep)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+ batchcount = (limit + 1) / 2;
+skip_setup:
+ err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -3741,11 +3989,11 @@ static int enable_cpucache(struct kmem_cache *cachep)
}
/*
- * Drain an array if it contains any elements taking the l3 lock only if
- * necessary. Note that the l3 listlock also protects the array_cache
+ * Drain an array if it contains any elements taking the node lock only if
+ * necessary. Note that the node listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
struct array_cache *ac, int force, int node)
{
int tofree;
@@ -3755,7 +4003,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
if (ac->touched && !force) {
ac->touched = 0;
} else {
- spin_lock_irq(&l3->list_lock);
+ spin_lock_irq(&n->list_lock);
if (ac->avail) {
tofree = force ? ac->avail : (ac->limit + 4) / 5;
if (tofree > ac->avail)
@@ -3765,13 +4013,13 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void *) * ac->avail);
}
- spin_unlock_irq(&l3->list_lock);
+ spin_unlock_irq(&n->list_lock);
}
}
/**
* cache_reap - Reclaim memory from caches.
- * @unused: unused parameter
+ * @w: work descriptor
*
* Called from workqueue/eventd every few seconds.
* Purpose:
@@ -3781,50 +4029,48 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
* If we cannot acquire the cache chain mutex then just give up - we'll try
* again on the next iteration.
*/
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *w)
{
struct kmem_cache *searchp;
- struct kmem_list3 *l3;
- int node = numa_node_id();
+ struct kmem_cache_node *n;
+ int node = numa_mem_id();
+ struct delayed_work *work = to_delayed_work(w);
- if (!mutex_trylock(&cache_chain_mutex)) {
+ if (!mutex_trylock(&slab_mutex))
/* Give up. Setup the next iteration. */
- schedule_delayed_work(&__get_cpu_var(reap_work),
- REAPTIMEOUT_CPUC);
- return;
- }
+ goto out;
- list_for_each_entry(searchp, &cache_chain, next) {
+ list_for_each_entry(searchp, &slab_caches, list) {
check_irq_on();
/*
- * We only take the l3 lock if absolutely necessary and we
+ * We only take the node lock if absolutely necessary and we
* have established with reasonable certainty that
* we can do some work if the lock was obtained.
*/
- l3 = searchp->nodelists[node];
+ n = searchp->node[node];
- reap_alien(searchp, l3);
+ reap_alien(searchp, n);
- drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
+ drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
/*
* These are racy checks but it does not matter
* if we skip one check or scan twice.
*/
- if (time_after(l3->next_reap, jiffies))
+ if (time_after(n->next_reap, jiffies))
goto next;
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE;
- drain_array(searchp, l3, l3->shared, 0, node);
+ drain_array(searchp, n, n->shared, 0, node);
- if (l3->free_touched)
- l3->free_touched = 0;
+ if (n->free_touched)
+ n->free_touched = 0;
else {
int freed;
- freed = drain_freelist(searchp, l3, (l3->free_limit +
+ freed = drain_freelist(searchp, n, (n->free_limit +
5 * searchp->num - 1) / (5 * searchp->num));
STATS_ADD_REAPED(searchp, freed);
}
@@ -3832,72 +4078,17 @@ next:
cond_resched();
}
check_irq_on();
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
next_reap_node();
- refresh_cpu_vm_stats(smp_processor_id());
+out:
/* Set up the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
-}
-
-#ifdef CONFIG_PROC_FS
-
-static void print_slabinfo_header(struct seq_file *m)
-{
- /*
- * Output format version, so at least we can change it
- * without _too_ many complaints.
- */
-#if STATS
- seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
-#else
- seq_puts(m, "slabinfo - version: 2.1\n");
-#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#if STATS
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
- seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
-#endif
- seq_putc(m, '\n');
+ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
}
-static void *s_start(struct seq_file *m, loff_t *pos)
+#ifdef CONFIG_SLABINFO
+void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- loff_t n = *pos;
- struct list_head *p;
-
- mutex_lock(&cache_chain_mutex);
- if (!n)
- print_slabinfo_header(m);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct kmem_cache *cachep = p;
- ++*pos;
- return cachep->next.next == &cache_chain ?
- NULL : list_entry(cachep->next.next, struct kmem_cache, next);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&cache_chain_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
-{
- struct kmem_cache *cachep = p;
- struct slab *slabp;
+ struct page *page;
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
@@ -3905,42 +4096,42 @@ static int s_show(struct seq_file *m, void *p)
const char *name;
char *error = NULL;
int node;
- struct kmem_list3 *l3;
+ struct kmem_cache_node *n;
active_objs = 0;
num_slabs = 0;
for_each_online_node(node) {
- l3 = cachep->nodelists[node];
- if (!l3)
+ n = cachep->node[node];
+ if (!n)
continue;
check_irq_on();
- spin_lock_irq(&l3->list_lock);
+ spin_lock_irq(&n->list_lock);
- list_for_each_entry(slabp, &l3->slabs_full, list) {
- if (slabp->inuse != cachep->num && !error)
+ list_for_each_entry(page, &n->slabs_full, lru) {
+ if (page->active != cachep->num && !error)
error = "slabs_full accounting error";
active_objs += cachep->num;
active_slabs++;
}
- list_for_each_entry(slabp, &l3->slabs_partial, list) {
- if (slabp->inuse == cachep->num && !error)
- error = "slabs_partial inuse accounting error";
- if (!slabp->inuse && !error)
- error = "slabs_partial/inuse accounting error";
- active_objs += slabp->inuse;
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (page->active == cachep->num && !error)
+ error = "slabs_partial accounting error";
+ if (!page->active && !error)
+ error = "slabs_partial accounting error";
+ active_objs += page->active;
active_slabs++;
}
- list_for_each_entry(slabp, &l3->slabs_free, list) {
- if (slabp->inuse && !error)
- error = "slabs_free/inuse accounting error";
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (page->active && !error)
+ error = "slabs_free accounting error";
num_slabs++;
}
- free_objects += l3->free_objects;
- if (l3->shared)
- shared_avail += l3->shared->avail;
+ free_objects += n->free_objects;
+ if (n->shared)
+ shared_avail += n->shared->avail;
- spin_unlock_irq(&l3->list_lock);
+ spin_unlock_irq(&n->list_lock);
}
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
@@ -3951,15 +4142,22 @@ static int s_show(struct seq_file *m, void *p)
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- name, active_objs, num_objs, cachep->buffer_size,
- cachep->num, (1 << cachep->gfporder));
- seq_printf(m, " : tunables %4u %4u %4u",
- cachep->limit, cachep->batchcount, cachep->shared);
- seq_printf(m, " : slabdata %6lu %6lu %6lu",
- active_slabs, num_slabs, shared_avail);
+ sinfo->active_objs = active_objs;
+ sinfo->num_objs = num_objs;
+ sinfo->active_slabs = active_slabs;
+ sinfo->num_slabs = num_slabs;
+ sinfo->shared_avail = shared_avail;
+ sinfo->limit = cachep->limit;
+ sinfo->batchcount = cachep->batchcount;
+ sinfo->shared = cachep->shared;
+ sinfo->objects_per_slab = cachep->num;
+ sinfo->cache_order = cachep->gfporder;
+}
+
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
+{
#if STATS
- { /* list3 stats */
+ { /* node stats */
unsigned long high = cachep->high_mark;
unsigned long allocs = cachep->num_allocations;
unsigned long grown = cachep->grown;
@@ -3970,10 +4168,11 @@ static int s_show(struct seq_file *m, void *p)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
- reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+ "%4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
+ reaped, errors, max_freeable, node_allocs,
+ node_frees, overflows);
}
/* cpu stats */
{
@@ -3986,31 +4185,8 @@ static int s_show(struct seq_file *m, void *p)
allochit, allocmiss, freehit, freemiss);
}
#endif
- seq_putc(m, '\n');
- return 0;
}
-/*
- * slabinfo_op - iterator that generates /proc/slabinfo
- *
- * Output layout:
- * cache-name
- * num-active-objs
- * total-objs
- * object size
- * num-active-slabs
- * total-slabs
- * num-pages-per-slab
- * + further values on SMP and with statistics enabled
- */
-
-struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
#define MAX_SLABINFO_WRITE 128
/**
* slabinfo_write - Tuning for the slab allocator
@@ -4019,7 +4195,7 @@ struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4041,21 +4217,22 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
return -EINVAL;
/* Find the cache in the chain of caches. */
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
res = -EINVAL;
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
if (!strcmp(cachep->name, kbuf)) {
if (limit < 1 || batchcount < 1 ||
batchcount > limit || shared < 0) {
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
- batchcount, shared);
+ batchcount, shared,
+ GFP_KERNEL);
}
break;
}
}
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
if (res >= 0)
res = count;
return res;
@@ -4065,17 +4242,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
static void *leaks_start(struct seq_file *m, loff_t *pos)
{
- loff_t n = *pos;
- struct list_head *p;
-
- mutex_lock(&cache_chain_mutex);
- p = cache_chain.next;
- while (n--) {
- p = p->next;
- if (p == &cache_chain)
- return NULL;
- }
- return list_entry(p, struct kmem_cache, next);
+ mutex_lock(&slab_mutex);
+ return seq_list_start(&slab_caches, *pos);
}
static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4108,15 +4276,18 @@ static inline int add_caller(unsigned long *n, unsigned long v)
return 1;
}
-static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+static void handle_slab(unsigned long *n, struct kmem_cache *c,
+ struct page *page)
{
void *p;
int i;
+
if (n[0] == n[1])
return;
- for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
- if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
+ for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
+ if (get_obj_status(page, i) != OBJECT_ACTIVE)
continue;
+
if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
return;
}
@@ -4125,16 +4296,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
- char *modname;
- const char *name;
unsigned long offset, size;
- char namebuf[KSYM_NAME_LEN+1];
-
- name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+ char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
- if (name) {
+ if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
- if (modname)
+ if (modname[0])
seq_printf(m, " [%s]", modname);
return;
}
@@ -4144,11 +4311,11 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = p;
- struct slab *slabp;
- struct kmem_list3 *l3;
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
+ struct page *page;
+ struct kmem_cache_node *n;
const char *name;
- unsigned long *n = m->private;
+ unsigned long *x = m->private;
int node;
int i;
@@ -4159,56 +4326,89 @@ static int leaks_show(struct seq_file *m, void *p)
/* OK, we can do it */
- n[1] = 0;
+ x[1] = 0;
for_each_online_node(node) {
- l3 = cachep->nodelists[node];
- if (!l3)
+ n = cachep->node[node];
+ if (!n)
continue;
check_irq_on();
- spin_lock_irq(&l3->list_lock);
+ spin_lock_irq(&n->list_lock);
- list_for_each_entry(slabp, &l3->slabs_full, list)
- handle_slab(n, cachep, slabp);
- list_for_each_entry(slabp, &l3->slabs_partial, list)
- handle_slab(n, cachep, slabp);
- spin_unlock_irq(&l3->list_lock);
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
}
name = cachep->name;
- if (n[0] == n[1]) {
+ if (x[0] == x[1]) {
/* Increase the buffer size */
- mutex_unlock(&cache_chain_mutex);
- m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+ mutex_unlock(&slab_mutex);
+ m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
if (!m->private) {
/* Too bad, we are really out */
- m->private = n;
- mutex_lock(&cache_chain_mutex);
+ m->private = x;
+ mutex_lock(&slab_mutex);
return -ENOMEM;
}
- *(unsigned long *)m->private = n[0] * 2;
- kfree(n);
- mutex_lock(&cache_chain_mutex);
+ *(unsigned long *)m->private = x[0] * 2;
+ kfree(x);
+ mutex_lock(&slab_mutex);
/* Now make sure this entry will be retried */
m->count = m->size;
return 0;
}
- for (i = 0; i < n[1]; i++) {
- seq_printf(m, "%s: %lu ", name, n[2*i+3]);
- show_symbol(m, n[2*i+2]);
+ for (i = 0; i < x[1]; i++) {
+ seq_printf(m, "%s: %lu ", name, x[2*i+3]);
+ show_symbol(m, x[2*i+2]);
seq_putc(m, '\n');
}
return 0;
}
-struct seq_operations slabstats_op = {
+static const struct seq_operations slabstats_op = {
.start = leaks_start,
- .next = s_next,
- .stop = s_stop,
+ .next = slab_next,
+ .stop = slab_stop,
.show = leaks_show,
};
+
+static int slabstats_open(struct inode *inode, struct file *file)
+{
+ unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ int ret = -ENOMEM;
+ if (n) {
+ ret = seq_open(file, &slabstats_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ *n = PAGE_SIZE / (2 * sizeof(unsigned long));
+ m->private = n;
+ n = NULL;
+ }
+ kfree(n);
+ }
+ return ret;
+}
+
+static const struct file_operations proc_slabstats_operations = {
+ .open = slabstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
#endif
+
+static int __init slab_proc_init(void)
+{
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
+#endif
+ return 0;
+}
+module_init(slab_proc_init);
#endif
/**
@@ -4223,10 +4423,12 @@ struct seq_operations slabstats_op = {
* allocated with either kmalloc() or kmem_cache_alloc(). The object
* must not be freed during the duration of the call.
*/
-unsigned int ksize(const void *objp)
+size_t ksize(const void *objp)
{
- if (unlikely(objp == NULL))
+ BUG_ON(!objp);
+ if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- return obj_size(virt_to_cache(objp));
+ return virt_to_cache(objp)->object_size;
}
+EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 00000000000..961a3fb1f5a
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,298 @@
+#ifndef MM_SLAB_H
+#define MM_SLAB_H
+/*
+ * Internal slab definitions
+ */
+
+/*
+ * State of the slab allocator.
+ *
+ * This is used to describe the states of the allocator during bootup.
+ * Allocators use this to gradually bootstrap themselves. Most allocators
+ * have the problem that the structures used for managing slab caches are
+ * allocated from slab caches themselves.
+ */
+enum slab_state {
+ DOWN, /* No slab functionality yet */
+ PARTIAL, /* SLUB: kmem_cache_node available */
+ PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
+ PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
+ UP, /* Slab caches usable but not all extras yet */
+ FULL /* Everything is working */
+};
+
+extern enum slab_state slab_state;
+
+/* The slab cache mutex protects the management structures during changes */
+extern struct mutex slab_mutex;
+
+/* The list of all slab caches on the system */
+extern struct list_head slab_caches;
+
+/* The slab cache that manages slab cache information */
+extern struct kmem_cache *kmem_cache;
+
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size);
+
+#ifndef CONFIG_SLOB
+/* Kmalloc array related functions */
+void create_kmalloc_caches(unsigned long);
+
+/* Find the kmalloc slab corresponding to a certain size */
+struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+#endif
+
+
+/* Functions provided by the slab allocators */
+extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
+
+extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags);
+extern void create_boot_cache(struct kmem_cache *, const char *name,
+ size_t size, unsigned long flags);
+
+struct mem_cgroup;
+#ifdef CONFIG_SLUB
+struct kmem_cache *
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *));
+#else
+static inline struct kmem_cache *
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{ return NULL; }
+#endif
+
+
+/* Legal flag mask for kmem_cache_create(), for various configurations */
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
+
+#if defined(CONFIG_DEBUG_SLAB)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+#elif defined(CONFIG_SLUB_DEBUG)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+#else
+#define SLAB_DEBUG_FLAGS (0)
+#endif
+
+#if defined(CONFIG_SLAB)
+#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+#elif defined(CONFIG_SLUB)
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_NOTRACK)
+#else
+#define SLAB_CACHE_FLAGS (0)
+#endif
+
+#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+
+int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *);
+void slab_kmem_cache_release(struct kmem_cache *);
+
+struct seq_file;
+struct file;
+
+struct slabinfo {
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs;
+ unsigned long num_slabs;
+ unsigned long shared_avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int shared;
+ unsigned int objects_per_slab;
+ unsigned int cache_order;
+};
+
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos);
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return !s->memcg_params || s->memcg_params->is_root_cache;
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return (p == s) ||
+ (s->memcg_params && (p == s->memcg_params->root_cache));
+}
+
+/*
+ * We use suffixes to the name in memcg because we can't have caches
+ * created in the system with the same name. But when we print them
+ * locally, better refer to them with the base name
+ */
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->name;
+ return s->name;
+}
+
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said, the caller must ensure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
+ if (!s->memcg_params)
+ return NULL;
+
+ rcu_read_lock();
+ params = rcu_dereference(s->memcg_params);
+ cachep = params->memcg_caches[idx];
+ rcu_read_unlock();
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match this (see
+ * memcg_register_cache()).
+ */
+ smp_read_barrier_depends();
+ return cachep;
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s))
+ return s;
+ return s->memcg_params->root_cache;
+}
+
+static __always_inline int memcg_charge_slab(struct kmem_cache *s,
+ gfp_t gfp, int order)
+{
+ if (!memcg_kmem_enabled())
+ return 0;
+ if (is_root_cache(s))
+ return 0;
+ return __memcg_charge_slab(s, gfp, order);
+}
+
+static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+ if (!memcg_kmem_enabled())
+ return;
+ if (is_root_cache(s))
+ return;
+ __memcg_uncharge_slab(s, order);
+}
+#else
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return true;
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return true;
+}
+
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ return s->name;
+}
+
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
+{
+ return NULL;
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ return s;
+}
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
+#endif
+
+static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
+{
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ /*
+ * When kmemcg is not being used, both assignments should return the
+ * same value, but we don't want to pay the assignment price in that
+ * case. If it is not compiled in, the compiler should be smart enough
+ * to not do even the assignment. In that case, slab_equal_or_root
+ * will also be a constant.
+ */
+ if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ return s;
+
+ page = virt_to_head_page(x);
+ cachep = page->slab_cache;
+ if (slab_equal_or_root(cachep, s))
+ return cachep;
+
+ pr_err("%s: Wrong slab cache. %s but object is from %s\n",
+ __FUNCTION__, cachep->name, s->name);
+ WARN_ON_ONCE(1);
+ return s;
+}
+#endif
+
+
+/*
+ * The slab lists for all objects.
+ */
+struct kmem_cache_node {
+ spinlock_t list_lock;
+
+#ifdef CONFIG_SLAB
+ struct list_head slabs_partial; /* partial list first, better asm code */
+ struct list_head slabs_full;
+ struct list_head slabs_free;
+ unsigned long free_objects;
+ unsigned int free_limit;
+ unsigned int colour_next; /* Per-node cache coloring */
+ struct array_cache *shared; /* shared per node */
+ struct array_cache **alien; /* on other nodes */
+ unsigned long next_reap; /* updated without locking */
+ int free_touched; /* updated without locking */
+#endif
+
+#ifdef CONFIG_SLUB
+ unsigned long nr_partial;
+ struct list_head partial;
+#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_t nr_slabs;
+ atomic_long_t total_objects;
+ struct list_head full;
+#endif
+#endif
+
+};
+
+void *slab_next(struct seq_file *m, void *p, loff_t *pos);
+void slab_stop(struct seq_file *m, void *p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 00000000000..d31c4bacc6a
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,789 @@
+/*
+ * Slab allocator functions that are independent of the allocator strategy
+ *
+ * (C) 2012 Christoph Lameter <cl@linux.com>
+ */
+#include <linux/slab.h>
+
+#include <linux/mm.h>
+#include <linux/poison.h>
+#include <linux/interrupt.h>
+#include <linux/memory.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+#include <linux/memcontrol.h>
+#include <trace/events/kmem.h>
+
+#include "slab.h"
+
+enum slab_state slab_state;
+LIST_HEAD(slab_caches);
+DEFINE_MUTEX(slab_mutex);
+struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_DEBUG_VM
+static int kmem_cache_sanity_check(const char *name, size_t size)
+{
+ struct kmem_cache *s = NULL;
+
+ if (!name || in_interrupt() || size < sizeof(void *) ||
+ size > KMALLOC_MAX_SIZE) {
+ pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(s, &slab_caches, list) {
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(s->name, tmp); /* must run before strcmp() touches s->name */
+ if (res) {
+ pr_err("Slab cache with size %d has lost its name\n",
+ s->object_size);
+ continue; /* skip this cache, keep checking the others */
+ }
+
+#if !defined(CONFIG_SLUB)
+ if (!strcmp(s->name, name)) {
+ pr_err("%s (%s): Cache name already exists.\n",
+ __func__, name);
+ dump_stack();
+ s = NULL; /* NOTE(review): dead store, s is not read before the return */
+ return -EINVAL;
+ }
+#endif /* !CONFIG_SLUB */
+ }
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+ return 0;
+}
+#else
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
+{
+ return 0; /* !CONFIG_DEBUG_VM variant: no checks, always succeeds */
+}
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+ struct kmem_cache *s;
+ int ret = 0;
+ mutex_lock(&slab_mutex); /* slab_caches is walked with slab_mutex held */
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue; /* only root caches are passed to memcg_update_cache_size() */
+
+ ret = memcg_update_cache_size(s, num_memcgs);
+ /*
+ * See comment in memcontrol.c, memcg_update_cache_size:
+ * Instead of freeing the memory, we'll just leave the caches
+ * up to this point in an updated state.
+ */
+ if (ret)
+ goto out;
+ }
+
+ memcg_update_array_size(num_memcgs); /* reached only when every update succeeded */
+out:
+ mutex_unlock(&slab_mutex);
+ return ret; /* 0, or the first error from memcg_update_cache_size() */
+}
+#endif
+
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * SLAB_HWCACHE_ALIGN is a suggestion: start from the cache line size
+ * and keep halving it while the object still fits into half of it,
+ * so that small objects are not padded out to a whole cache line.
+ *
+ * The result never drops below the alignment the caller asked for.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long hw_align = cache_line_size();
+ for (; size <= hw_align / 2; hw_align /= 2)
+ ;
+ align = (hw_align > align) ? hw_align : align;
+ }
+
+ /* Never go below the architecture's minimum slab alignment. */
+ align = (align < ARCH_SLAB_MINALIGN) ? ARCH_SLAB_MINALIGN : align;
+
+ return ALIGN(align, sizeof(void *));
+}
+
+/*
+ * Allocate one kmem_cache descriptor, set it up and add it to slab_caches.
+ * Returns the cache or ERR_PTR(); on failure @name is left to the caller.
+ */
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *),
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ int err;
+
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+
+ s->name = name;
+ s->object_size = object_size;
+ s->size = size;
+ s->align = align;
+ s->ctor = ctor;
+
+ err = memcg_alloc_cache_params(memcg, s, root_cache);
+ if (err)
+ goto out_free_cache;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
+
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ return s;
+
+out_free_cache:
+ memcg_free_cache_params(s);
+ kmem_cache_free(kmem_cache, s); /* s came from kmem_cache, so not kfree() */
+ return ERR_PTR(err);
+}
+
+/**
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Returns a ptr to the cache on success, NULL on failure.
+ * Cannot be called within an interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ */
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+ char *cache_name;
+ int err;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ err = kmem_cache_sanity_check(name, size);
+ if (err)
+ goto out_unlock;
+
+ /*
+ * Some allocators will constrain the set of valid flags to a subset
+ * of all flags. We expect them to define CACHE_CREATE_MASK in this
+ * case, and we'll just provide them with a sanitized version of the
+ * passed flags.
+ */
+ flags &= CACHE_CREATE_MASK;
+
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
+ if (s) /* an existing cache is handed back; err is still 0 here */
+ goto out_unlock;
+
+ cache_name = kstrdup(name, GFP_KERNEL); /* the cache keeps its own copy of the name */
+ if (!cache_name) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ s = do_kmem_cache_create(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ kfree(cache_name); /* on failure the name copy is still ours to free */
+ }
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ if (err) {
+ if (flags & SLAB_PANIC)
+ panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
+ name, err);
+ else {
+ printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d\n",
+ name, err);
+ dump_stack();
+ }
+ return NULL; /* callers see NULL, never an ERR_PTR() */
+ }
+ return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * memcg_create_kmem_cache - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ * @memcg_name: The name of the memory cgroup (used for naming the new cache).
+ *
+ * Creates the kmem cache that serves allocation requests going from @memcg
+ * to @root_cache; the new cache inherits its properties from its parent.
+ * Returns the new cache, or NULL if building the name or the cache failed.
+ */
+struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache,
+ const char *memcg_name)
+{
+ struct kmem_cache *s = NULL;
+ char *cache_name;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ memcg_cache_id(memcg), memcg_name);
+ if (!cache_name)
+ goto out_unlock; /* s is still NULL */
+
+ s = do_kmem_cache_create(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
+ if (IS_ERR(s)) {
+ kfree(cache_name); /* on failure the name is still ours to free */
+ s = NULL; /* the error code is dropped; callers only see NULL */
+ }
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ return s;
+}
+
+static int memcg_cleanup_cache_params(struct kmem_cache *s)
+{
+ int rc;
+
+ if (!s->memcg_params ||
+ !s->memcg_params->is_root_cache)
+ return 0; /* nothing to do unless s is a root cache with memcg params */
+
+ mutex_unlock(&slab_mutex); /* entered with slab_mutex held; dropped around the cleanup */
+ rc = __memcg_cleanup_cache_params(s);
+ mutex_lock(&slab_mutex); /* re-taken so the caller still holds it on return */
+
+ return rc;
+}
+#else
+static int memcg_cleanup_cache_params(struct kmem_cache *s)
+{
+ return 0; /* !CONFIG_MEMCG_KMEM variant: nothing to clean up, always succeeds */
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+void slab_kmem_cache_release(struct kmem_cache *s)
+{
+ kfree(s->name); /* NOTE(review): assumes a kmalloc'ed name (kstrdup/kasprintf) — confirm */
+ kmem_cache_free(kmem_cache, s); /* the descriptor goes back to the kmem_cache cache */
+}
+
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ s->refcount--; /* NOTE(review): not restored if a later step below fails */
+ if (s->refcount) /* still referenced: leave the cache in place */
+ goto out_unlock;
+
+ if (memcg_cleanup_cache_params(s) != 0)
+ goto out_unlock;
+
+ if (__kmem_cache_shutdown(s) != 0) {
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ goto out_unlock;
+ }
+
+ list_del(&s->list);
+
+ mutex_unlock(&slab_mutex); /* s is off slab_caches; the rest runs without the mutex */
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_free_cache_params(s);
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_remove(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ goto out; /* skip out_unlock: slab_mutex was already dropped above */
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+out:
+ put_online_mems();
+ put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+
+ get_online_cpus(); /* both get_online_*() calls are undone below in reverse order */
+ get_online_mems();
+ ret = __kmem_cache_shrink(cachep); /* its result is returned unchanged */
+ put_online_mems();
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+int slab_is_available(void)
+{
+ return slab_state >= UP; /* create_kmalloc_caches() sets UP once the kmalloc array is usable */
+}
+
+#ifndef CONFIG_SLOB
+/* Create a cache during boot when no slab services are available yet */
+void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
+ unsigned long flags)
+{
+ int err;
+
+ s->name = name; /* stored as passed in, not copied */
+ s->size = s->object_size = size;
+ s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+ err = __kmem_cache_create(s, flags);
+
+ if (err) /* no error return: a failed boot cache is fatal */
+ panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
+ name, size, err);
+
+ s->refcount = -1; /* Exempt from merging for now */
+}
+
+struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags)
+{
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ if (!s) /* never returns NULL: allocation failure panics */
+ panic("Out of memory when creating slab %s\n", name);
+
+ create_boot_cache(s, name, size, flags);
+ list_add(&s->list, &slab_caches);
+ s->refcount = 1; /* replaces the -1 that create_boot_cache() left behind */
+ return s;
+}
+
+struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+EXPORT_SYMBOL(kmalloc_caches);
+
+#ifdef CONFIG_ZONE_DMA
+struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
+EXPORT_SYMBOL(kmalloc_dma_caches);
+#endif
+
+/*
+ * Conversion table from (size - 1) / 8 of a small allocation to its index
+ * in the kmalloc array. This is necessary for sizes <= 192 since we have
+ * non power of two cache sizes there. The index for larger sizes can be
+ * determined using fls.
+ */
+static s8 size_index[24] = {
+ 3, /* 8 */
+ 4, /* 16 */
+ 5, /* 24 */
+ 5, /* 32 */
+ 6, /* 40 */
+ 6, /* 48 */
+ 6, /* 56 */
+ 6, /* 64 */
+ 1, /* 72 */
+ 1, /* 80 */
+ 1, /* 88 */
+ 1, /* 96 */
+ 7, /* 104 */
+ 7, /* 112 */
+ 7, /* 120 */
+ 7, /* 128 */
+ 2, /* 136 */
+ 2, /* 144 */
+ 2, /* 152 */
+ 2, /* 160 */
+ 2, /* 168 */
+ 2, /* 176 */
+ 2, /* 184 */
+ 2 /* 192 */
+};
+
+static inline int size_index_elem(size_t bytes)
+{
+ return (bytes - 1) >> 3; /* eight sizes per slot: 1..8 -> 0, 9..16 -> 1, ... */
+}
+
+/*
+ * Find the kmem_cache structure that serves a given size of
+ * allocation
+ */
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+{
+ int index;
+
+ if (unlikely(size > KMALLOC_MAX_SIZE)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN)); /* warn unless the caller passed __GFP_NOWARN */
+ return NULL;
+ }
+
+ if (size <= 192) {
+ if (!size)
+ return ZERO_SIZE_PTR; /* zero-sized request: special pointer, not a cache */
+
+ index = size_index[size_index_elem(size)]; /* table lookup for sizes 1..192 */
+ } else
+ index = fls(size - 1); /* larger sizes: index computed with fls() */
+
+#ifdef CONFIG_ZONE_DMA
+ if (unlikely((flags & GFP_DMA)))
+ return kmalloc_dma_caches[index];
+
+#endif /* CONFIG_ZONE_DMA */
+ return kmalloc_caches[index];
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(unsigned long flags)
+{
+ int i;
+
+ /*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+ (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+ int elem = size_index_elem(i);
+
+ if (elem >= ARRAY_SIZE(size_index))
+ break; /* do not write past the end of size_index */
+ size_index[elem] = KMALLOC_SHIFT_LOW; /* sizes below the minimum use the smallest cache */
+ }
+
+ if (KMALLOC_MIN_SIZE >= 64) {
+ /*
+ * The 96 byte size cache is not used if the alignment
+ * is 64 byte.
+ */
+ for (i = 64 + 8; i <= 96; i += 8)
+ size_index[size_index_elem(i)] = 7;
+
+ }
+
+ if (KMALLOC_MIN_SIZE >= 128) {
+ /*
+ * The 192 byte sized cache is not used if the alignment
+ * is 128 byte. Redirect kmalloc to use the 256 byte cache
+ * instead.
+ */
+ for (i = 128 + 8; i <= 192; i += 8)
+ size_index[size_index_elem(i)] = 8;
+ }
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[i]) { /* skip slots that were already populated */
+ kmalloc_caches[i] = create_kmalloc_cache(NULL,
+ 1 << i, flags);
+ }
+
+ /*
+ * Caches that are not of the two-to-the-power-of size.
+ * These have to be created immediately after the
+ * earlier power of two caches
+ */
+ if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
+ kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
+
+ if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
+ kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
+ }
+
+ /* Kmalloc array is now usable */
+ slab_state = UP;
+
+ for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+ char *n;
+
+ if (s) {
+ n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
+
+ BUG_ON(!n);
+ s->name = n; /* caches created in the loop above were given a NULL name */
+ }
+ }
+
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+
+ if (s) { /* one DMA cache for every regular cache that exists */
+ int size = kmalloc_size(i);
+ char *n = kasprintf(GFP_NOWAIT,
+ "dma-kmalloc-%d", size);
+
+ BUG_ON(!n);
+ kmalloc_dma_caches[i] = create_kmalloc_cache(n,
+ size, SLAB_CACHE_DMA | flags);
+ }
+ }
+#endif /* CONFIG_ZONE_DMA */
+}
+#endif /* !CONFIG_SLOB */
+
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ */
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret;
+ struct page *page;
+
+ flags |= __GFP_COMP; /* always request a compound page */
+ page = alloc_kmem_pages(flags, order);
+ ret = page ? page_address(page) : NULL; /* NULL when the page allocation failed */
+ kmemleak_alloc(ret, size, 1, flags); /* called on failure too, with ret == NULL */
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
+#ifdef CONFIG_TRACING
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = kmalloc_order(size, flags, order); /* same allocation as the untraced variant */
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); /* requested size vs. bytes allocated */
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
+#endif
+
+#ifdef CONFIG_SLABINFO
+
+#ifdef CONFIG_SLAB
+#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
+#else
+#define SLABINFO_RIGHTS S_IRUSR
+#endif
+
+void print_slabinfo_header(struct seq_file *m)
+{
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ */
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+ seq_puts(m, "slabinfo - version: 2.1\n");
+#endif /* CONFIG_DEBUG_SLAB */
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+ "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif /* CONFIG_DEBUG_SLAB */
+ seq_putc(m, '\n'); /* the column legend above is emitted as a single line */
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ /* slab_mutex is taken here and released again in slab_stop(). */
+ mutex_lock(&slab_mutex);
+
+ if (*pos == 0) /* the header is printed only at position 0 */
+ print_slabinfo_header(m);
+
+ return seq_list_start(&slab_caches, *pos);
+}
+
+void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos); /* step along slab_caches; m is unused */
+}
+
+void slab_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex); /* pairs with the mutex_lock() in s_start() */
+}
+
+static void
+memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
+{
+ int idx;
+
+ /* Only a root cache has per-memcg caches to fold into @info. */
+ if (!is_root_cache(s))
+ return;
+
+ for_each_memcg_cache_index(idx) {
+ struct kmem_cache *child = cache_from_memcg_idx(s, idx);
+ struct slabinfo child_info;
+
+ if (child == NULL)
+ continue;
+
+ memset(&child_info, 0, sizeof(child_info));
+ get_slabinfo(child, &child_info);
+ info->active_objs += child_info.active_objs;
+ info->num_objs += child_info.num_objs;
+ info->active_slabs += child_info.active_slabs;
+ info->num_slabs += child_info.num_slabs;
+ info->shared_avail += child_info.shared_avail;
+ }
+}
+
+int cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+ struct slabinfo sinfo;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(s, &sinfo);
+
+ memcg_accumulate_slabinfo(s, &sinfo); /* for a root cache, adds its per-memcg caches */
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ sinfo.objects_per_slab, (1 << sinfo.cache_order)); /* last column: pages per slab */
+
+ seq_printf(m, " : tunables %4u %4u %4u",
+ sinfo.limit, sinfo.batchcount, sinfo.shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+ slabinfo_show_stats(m, s);
+ seq_putc(m, '\n');
+ return 0; /* always 0 */
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+ /* Non-root (per-memcg) caches get no line of their own; */
+ /* cache_show() folds their numbers into the root cache. */
+ return is_root_cache(s) ? cache_show(s, m) : 0;
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout (see print_slabinfo_header() and cache_show()):
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * objects-per-slab
+ * pages-per-slab
+ * tunables (limit, batchcount, shared), then slabdata (slab counts)
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+ .start = s_start,
+ .next = slab_next,
+ .stop = slab_stop,
+ .show = s_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &slabinfo_op); /* bind the slabinfo_op iterator to this open file */
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+ .open = slabinfo_open,
+ .read = seq_read,
+ .write = slabinfo_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+ proc_create("slabinfo", SLABINFO_RIGHTS, NULL, /* NOTE(review): result is not checked */
+ &proc_slabinfo_operations);
+ return 0; /* reports success unconditionally */
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
diff --git a/mm/slob.c b/mm/slob.c
index 542394184a5..21980e0f39a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,338 +3,642 @@
*
* Matt Mackall <mpm@selenic.com> 12/30/03
*
+ * NUMA support by Paul Mundt, 2007.
+ *
* How SLOB works:
*
* The core of SLOB is a traditional K&R style heap allocator, with
* support for returning aligned objects. The granularity of this
- * allocator is 8 bytes on x86, though it's perhaps possible to reduce
- * this to 4 if it's deemed worth the effort. The slob heap is a
- * singly-linked list of pages from __get_free_page, grown on demand
- * and allocation from the heap is currently first-fit.
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a set of linked lists of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
*
* Above this is an implementation of kmalloc/kfree. Blocks returned
- * from kmalloc are 8-byte aligned and prepended with a 8-byte header.
+ * from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
- * __get_free_pages directly so that it can return page-aligned blocks
- * and keeps a linked list of such pages and their orders. These
- * objects are detected in kfree() by their page alignment.
+ * alloc_pages() directly, allocating compound pages so the page order
+ * does not have to be separately tracked.
+ * These objects are detected in kfree() because PageSlab()
+ * is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
- * destructors for every SLAB allocation. Objects are returned with
- * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
- * set, in which case the low-level allocator will fragment blocks to
- * create the proper alignment. Again, objects of page-size or greater
- * are allocated by calling __get_free_pages. As SLAB objects know
- * their size, no separate size bookkeeping is necessary and there is
- * essentially no allocation space overhead.
+ * destructors for every SLAB allocation. Objects are returned with the
+ * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
+ * case the low-level allocator will fragment blocks to create the proper
+ * alignment. Again, objects of page-size or greater are allocated by
+ * calling alloc_pages(). As SLAB objects know their size, no separate
+ * size bookkeeping is necessary and there is essentially no allocation
+ * space overhead, and compound pages aren't needed for multi-page
+ * allocations.
+ *
+ * NUMA support in SLOB is fairly simplistic, pushing most of the real
+ * logic down to the page allocator, and simply doing the node accounting
+ * on the upper levels. In the event that a node id is explicitly
+ * provided, alloc_pages_exact_node() with the specified node id is used
+ * instead. The common case (or when the node id isn't explicitly provided)
+ * will default to the current node, as per numa_node_id().
+ *
+ * Node aware pages are still inserted into the global freelist, and
+ * these are scanned for by matching against the node id encoded in the
+ * page flags. As a result, block allocations that can be satisfied from
+ * the freelist will only be done so on pages residing on the same node,
+ * in order to prevent random node placement.
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
+
#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/timer.h>
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/kmemleak.h>
+
+#include <trace/events/kmem.h>
+
+#include <linux/atomic.h>
+
+#include "slab.h"
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ */
+#if PAGE_SIZE <= (32767 * 2)
+typedef s16 slobidx_t;
+#else
+typedef s32 slobidx_t;
+#endif
struct slob_block {
- int units;
- struct slob_block *next;
+ slobidx_t units;
};
typedef struct slob_block slob_t;
+/*
+ * All partially free slob pages go on these lists.
+ */
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
+
+/*
+ * slob_page_free: true for pages marked free by set_slob_page_free().
+ */
+static inline int slob_page_free(struct page *sp)
+{
+ return PageSlobFree(sp);
+}
+
+static void set_slob_page_free(struct page *sp, struct list_head *list)
+{
+ list_add(&sp->lru, list); /* link the page into the list chosen by the caller */
+ __SetPageSlobFree(sp); /* the flag that slob_page_free() tests */
+}
+
+static inline void clear_slob_page_free(struct page *sp)
+{
+ list_del(&sp->lru); /* unlink from whichever list currently holds the page */
+ __ClearPageSlobFree(sp); /* slob_page_free() is false from here on */
+}
+
#define SLOB_UNIT sizeof(slob_t)
-#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
-#define SLOB_ALIGN L1_CACHE_BYTES
+#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
-struct bigblock {
- int order;
- void *pages;
- struct bigblock *next;
+/*
+ * struct slob_rcu is inserted at the tail of allocated slob blocks, which
+ * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
+ * the block using call_rcu.
+ */
+struct slob_rcu {
+ struct rcu_head head;
+ int size;
};
-typedef struct bigblock bigblock_t;
-static slob_t arena = { .next = &arena, .units = 1 };
-static slob_t *slobfree = &arena;
-static bigblock_t *bigblocks;
+/*
+ * slob_lock protects all slob allocator structures.
+ */
static DEFINE_SPINLOCK(slob_lock);
-static DEFINE_SPINLOCK(block_lock);
-static void slob_free(void *b, int size);
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); /* start of the page holding s */
+ slobidx_t offset = next - base; /* index of next, in slob_t units from base */
+
+ if (size > 1) {
+ s[0].units = size;
+ s[1].units = offset;
+ } else
+ s[0].units = -offset; /* single unit: only the negated offset is stored */
+}
+
+/*
+ * Return the size of a slob block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+ /* A positive 'units' field is the block size in SLOB_UNITs; any */
+ /* other value encodes a next-offset, i.e. a block of one unit. */
+ return (s->units > 0) ? s->units : 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+ slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); /* start of the page holding s */
+ slobidx_t next;
+
+ if (s[0].units < 0) /* negative: the field is the negated next offset */
+ next = -s[0].units;
+ else /* otherwise the offset is read from the second unit */
+ next = s[1].units;
+ return base+next;
+}
+
+/*
+ * Returns true if s is the last free block in its page.
+ */
+static int slob_last(slob_t *s)
+{
+ return ((unsigned long)slob_next(s) & ~PAGE_MASK) == 0; /* next pointer is page-aligned */
+}
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- slob_t *prev, *cur, *aligned = 0;
+ void *page;
+
+#ifdef CONFIG_NUMA
+ if (node != NUMA_NO_NODE)
+ page = alloc_pages_exact_node(node, gfp, order);
+ else
+#endif
+ page = alloc_pages(gfp, order);
+
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+static void slob_free_pages(void *b, int order)
+{
+ if (current->reclaim_state) /* only when the current task has a reclaim_state */
+ current->reclaim_state->reclaimed_slab += 1 << order; /* add the number of pages being freed */
+ free_pages((unsigned long)b, order);
+}
+
+/*
+ * Allocate a slob block within a given slob_page sp.
+ */
+static void *slob_page_alloc(struct page *sp, size_t size, int align)
+{
+ slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
- unsigned long flags;
- spin_lock_irqsave(&slob_lock, flags);
- prev = slobfree;
- for (cur = prev->next; ; prev = cur, cur = cur->next) {
+ for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
+ slobidx_t avail = slob_units(cur);
+
if (align) {
aligned = (slob_t *)ALIGN((unsigned long)cur, align);
delta = aligned - cur;
}
- if (cur->units >= units + delta) { /* room enough? */
+ if (avail >= units + delta) { /* room enough? */
+ slob_t *next;
+
if (delta) { /* need to fragment head to align? */
- aligned->units = cur->units - delta;
- aligned->next = cur->next;
- cur->next = aligned;
- cur->units = delta;
+ next = slob_next(cur);
+ set_slob(aligned, avail - delta, next);
+ set_slob(cur, delta, aligned);
prev = cur;
cur = aligned;
+ avail = slob_units(cur);
}
- if (cur->units == units) /* exact fit? */
- prev->next = cur->next; /* unlink */
- else { /* fragment */
- prev->next = cur + units;
- prev->next->units = cur->units - units;
- prev->next->next = cur->next;
- cur->units = units;
+ next = slob_next(cur);
+ if (avail == units) { /* exact fit? unlink. */
+ if (prev)
+ set_slob(prev, slob_units(prev), next);
+ else
+ sp->freelist = next;
+ } else { /* fragment */
+ if (prev)
+ set_slob(prev, slob_units(prev), cur + units);
+ else
+ sp->freelist = cur + units;
+ set_slob(cur + units, avail - units, next);
}
- slobfree = prev;
- spin_unlock_irqrestore(&slob_lock, flags);
+ sp->units -= units;
+ if (!sp->units)
+ clear_slob_page_free(sp);
return cur;
}
- if (cur == slobfree) {
- spin_unlock_irqrestore(&slob_lock, flags);
+ if (slob_last(cur))
+ return NULL;
+ }
+}
- if (size == PAGE_SIZE) /* trying to shrink arena? */
- return 0;
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+{
+ struct page *sp;
+ struct list_head *prev;
+ struct list_head *slob_list;
+ slob_t *b = NULL;
+ unsigned long flags;
- cur = (slob_t *)__get_free_page(gfp);
- if (!cur)
- return 0;
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
- slob_free(cur, PAGE_SIZE);
- spin_lock_irqsave(&slob_lock, flags);
- cur = slobfree;
- }
+ spin_lock_irqsave(&slob_lock, flags);
+ /* Iterate through each partially free page, try to find room */
+ list_for_each_entry(sp, slob_list, lru) {
+#ifdef CONFIG_NUMA
+ /*
+ * If there's a node specification, search for a partial
+ * page with a matching node id in the freelist.
+ */
+ if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
+ continue;
+#endif
+ /* Enough room on this page? */
+ if (sp->units < SLOB_UNITS(size))
+ continue;
+
+ /* Attempt to alloc */
+ prev = sp->lru.prev;
+ b = slob_page_alloc(sp, size, align);
+ if (!b)
+ continue;
+
+ /* Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449) */
+ if (prev != slob_list->prev &&
+ slob_list->next != prev->next)
+ list_move_tail(slob_list, prev->next);
+ break;
}
+ spin_unlock_irqrestore(&slob_lock, flags);
+
+ /* Not enough space: must allocate a new page */
+ if (!b) {
+ b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
+ if (!b)
+ return NULL;
+ sp = virt_to_page(b);
+ __SetPageSlab(sp);
+
+ spin_lock_irqsave(&slob_lock, flags);
+ sp->units = SLOB_UNITS(PAGE_SIZE);
+ sp->freelist = b;
+ INIT_LIST_HEAD(&sp->lru);
+ set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+ set_slob_page_free(sp, slob_list);
+ b = slob_page_alloc(sp, size, align);
+ BUG_ON(!b);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ }
+ if (unlikely((gfp & __GFP_ZERO) && b))
+ memset(b, 0, size);
+ return b;
}
+/*
+ * slob_free: entry point into the slob allocator.
+ */
static void slob_free(void *block, int size)
{
- slob_t *cur, *b = (slob_t *)block;
+ struct page *sp;
+ slob_t *prev, *next, *b = (slob_t *)block;
+ slobidx_t units;
unsigned long flags;
+ struct list_head *slob_list;
- if (!block)
+ if (unlikely(ZERO_OR_NULL_PTR(block)))
return;
+ BUG_ON(!size);
- if (size)
- b->units = SLOB_UNITS(size);
+ sp = virt_to_page(block);
+ units = SLOB_UNITS(size);
- /* Find reinsertion point */
spin_lock_irqsave(&slob_lock, flags);
- for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
- if (cur >= cur->next && (b > cur || b < cur->next))
- break;
- if (b + b->units == cur->next) {
- b->units += cur->next->units;
- b->next = cur->next->next;
- } else
- b->next = cur->next;
+ if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+ /* Go directly to page allocator. Do not pass slob allocator */
+ if (slob_page_free(sp))
+ clear_slob_page_free(sp);
+ spin_unlock_irqrestore(&slob_lock, flags);
+ __ClearPageSlab(sp);
+ page_mapcount_reset(sp);
+ slob_free_pages(b, 0);
+ return;
+ }
- if (cur + cur->units == b) {
- cur->units += b->units;
- cur->next = b->next;
- } else
- cur->next = b;
+ if (!slob_page_free(sp)) {
+ /* This slob page is about to become partially free. Easy! */
+ sp->units = units;
+ sp->freelist = b;
+ set_slob(b, units,
+ (void *)((unsigned long)(b +
+ SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+ set_slob_page_free(sp, slob_list);
+ goto out;
+ }
- slobfree = cur;
+ /*
+ * Otherwise the page is already partially free, so find reinsertion
+ * point.
+ */
+ sp->units += units;
+ if (b < (slob_t *)sp->freelist) {
+ if (b + units == sp->freelist) {
+ units += slob_units(sp->freelist);
+ sp->freelist = slob_next(sp->freelist);
+ }
+ set_slob(b, units, sp->freelist);
+ sp->freelist = b;
+ } else {
+ prev = sp->freelist;
+ next = slob_next(prev);
+ while (b > next) {
+ prev = next;
+ next = slob_next(prev);
+ }
+
+ if (!slob_last(prev) && b + units == next) {
+ units += slob_units(next);
+ set_slob(b, units, slob_next(next));
+ } else
+ set_slob(b, units, next);
+
+ if (prev + slob_units(prev) == b) {
+ units = slob_units(b) + slob_units(prev);
+ set_slob(prev, units, slob_next(b));
+ } else
+ set_slob(prev, slob_units(prev), b);
+ }
+out:
spin_unlock_irqrestore(&slob_lock, flags);
}
-static int FASTCALL(find_order(int size));
-static int fastcall find_order(int size)
-{
- int order = 0;
- for ( ; size > 4096 ; size >>=1)
- order++;
- return order;
-}
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
+ */
-void *kmalloc(size_t size, gfp_t gfp)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
- slob_t *m;
- bigblock_t *bb;
- unsigned long flags;
+ unsigned int *m;
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ void *ret;
- if (size < PAGE_SIZE - SLOB_UNIT) {
- m = slob_alloc(size + SLOB_UNIT, gfp, 0);
- return m ? (void *)(m + 1) : 0;
- }
+ gfp &= gfp_allowed_mask;
- bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
- if (!bb)
- return 0;
+ lockdep_trace_alloc(gfp);
+
+ if (size < PAGE_SIZE - align) {
+ if (!size)
+ return ZERO_SIZE_PTR;
+
+ m = slob_alloc(size + align, gfp, align, node);
+
+ if (!m)
+ return NULL;
+ *m = size;
+ ret = (void *)m + align;
- bb->order = find_order(size);
- bb->pages = (void *)__get_free_pages(gfp, bb->order);
+ trace_kmalloc_node(caller, ret,
+ size, size + align, gfp, node);
+ } else {
+ unsigned int order = get_order(size);
- if (bb->pages) {
- spin_lock_irqsave(&block_lock, flags);
- bb->next = bigblocks;
- bigblocks = bb;
- spin_unlock_irqrestore(&block_lock, flags);
- return bb->pages;
+ if (likely(order))
+ gfp |= __GFP_COMP;
+ ret = slob_new_pages(gfp, order, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << order, gfp, node);
}
- slob_free(bb, sizeof(bigblock_t));
- return 0;
+ kmemleak_alloc(ret, size, 1, gfp);
+ return ret;
}
-EXPORT_SYMBOL(kmalloc);
+/*
+ * Allocate @size bytes without a node preference (NUMA_NO_NODE), recording
+ * this function's return address as the caller for tracing.
+ */
+void *__kmalloc(size_t size, gfp_t gfp)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+#ifdef CONFIG_TRACING
+/*
+ * Same as __kmalloc(), except that the address passed to the trace point
+ * is supplied by the caller in @caller.
+ */
+void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Node-aware variant of __kmalloc_track_caller(): @node is handed through
+ * to __do_kmalloc_node() together with the caller-supplied @caller address.
+ */
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, node, caller);
+}
+#endif
+#endif
void kfree(const void *block)
{
- bigblock_t *bb, **last = &bigblocks;
- unsigned long flags;
+ struct page *sp;
- if (!block)
- return;
+ trace_kfree(_RET_IP_, block);
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- /* might be on the big block list */
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
- if (bb->pages == block) {
- *last = bb->next;
- spin_unlock_irqrestore(&block_lock, flags);
- free_pages((unsigned long)block, bb->order);
- slob_free(bb, sizeof(bigblock_t));
- return;
- }
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
+ if (unlikely(ZERO_OR_NULL_PTR(block)))
+ return;
+ kmemleak_free(block);
- slob_free((slob_t *)block - 1, 0);
- return;
+ sp = virt_to_page(block);
+ if (PageSlab(sp)) {
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int *m = (unsigned int *)(block - align);
+ slob_free(m, *m + align);
+ } else
+ __free_pages(sp, compound_order(sp));
}
-
EXPORT_SYMBOL(kfree);
-unsigned int ksize(const void *block)
+/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
+size_t ksize(const void *block)
{
- bigblock_t *bb;
- unsigned long flags;
+ struct page *sp;
+ int align;
+ unsigned int *m;
- if (!block)
+ BUG_ON(!block);
+ if (unlikely(block == ZERO_SIZE_PTR))
return 0;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; bb = bb->next)
- if (bb->pages == block) {
- spin_unlock_irqrestore(&slob_lock, flags);
- return PAGE_SIZE << bb->order;
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
+ sp = virt_to_page(block);
+ if (unlikely(!PageSlab(sp)))
+ return PAGE_SIZE << compound_order(sp);
- return ((slob_t *)block - 1)->units * SLOB_UNIT;
+ align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
}
+EXPORT_SYMBOL(ksize);
-struct kmem_cache {
- unsigned int size, align;
- const char *name;
- void (*ctor)(void *, struct kmem_cache *, unsigned long);
- void (*dtor)(void *, struct kmem_cache *, unsigned long);
-};
+/*
+ * Finish setting up cache @c: record @flags in the cache and, for
+ * SLAB_DESTROY_BY_RCU caches, grow the object size by one struct slob_rcu.
+ * Always returns 0.
+ */
+int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
+{
+	/* leave room for rcu footer at the end of object */
+	if (flags & SLAB_DESTROY_BY_RCU)
+		c->size += sizeof(struct slob_rcu);
+
+	c->flags = flags;
+
+	return 0;
+}
-struct kmem_cache *kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(void*, struct kmem_cache *, unsigned long),
- void (*dtor)(void*, struct kmem_cache *, unsigned long))
+void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
- struct kmem_cache *c;
-
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
-
- if (c) {
- c->name = name;
- c->size = size;
- c->ctor = ctor;
- c->dtor = dtor;
- /* ignore alignment unless it's forced */
- c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
- if (c->align < align)
- c->align = align;
+ void *b;
+
+ flags &= gfp_allowed_mask;
+
+ lockdep_trace_alloc(flags);
+
+ if (c->size < PAGE_SIZE) {
+ b = slob_alloc(c->size, flags, c->align, node);
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ SLOB_UNITS(c->size) * SLOB_UNIT,
+ flags, node);
+ } else {
+ b = slob_new_pages(flags, get_order(c->size), node);
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ PAGE_SIZE << get_order(c->size),
+ flags, node);
}
- return c;
+ if (b && c->ctor)
+ c->ctor(b);
+
+ kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
+ return b;
}
-EXPORT_SYMBOL(kmem_cache_create);
+EXPORT_SYMBOL(slob_alloc_node);
-void kmem_cache_destroy(struct kmem_cache *c)
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- slob_free(c, sizeof(struct kmem_cache));
+ return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
-EXPORT_SYMBOL(kmem_cache_destroy);
+EXPORT_SYMBOL(kmem_cache_alloc);
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
- void *b;
-
- if (c->size < PAGE_SIZE)
- b = slob_alloc(c->size, flags, c->align);
- else
- b = (void *)__get_free_pages(flags, find_order(c->size));
+ return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
- if (c->ctor)
- c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
+/* Allocate one object from @cachep on @node; forwards to slob_alloc_node(). */
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
+{
+ return slob_alloc_node(cachep, gfp, node);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
- return b;
+static void __kmem_cache_free(void *b, int size)
+{
+ if (size < PAGE_SIZE)
+ slob_free(b, size);
+ else
+ slob_free_pages(b, get_order(size));
}
-EXPORT_SYMBOL(kmem_cache_alloc);
-void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
+static void kmem_rcu_free(struct rcu_head *head)
{
- void *ret = kmem_cache_alloc(c, flags);
- if (ret)
- memset(ret, 0, c->size);
+ struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
+ void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));
- return ret;
+ __kmem_cache_free(b, slob_rcu->size);
}
-EXPORT_SYMBOL(kmem_cache_zalloc);
void kmem_cache_free(struct kmem_cache *c, void *b)
{
- if (c->dtor)
- c->dtor(b, c, 0);
+ kmemleak_free_recursive(b, c->flags);
+ if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
+ struct slob_rcu *slob_rcu;
+ slob_rcu = b + (c->size - sizeof(struct slob_rcu));
+ slob_rcu->size = c->size;
+ call_rcu(&slob_rcu->head, kmem_rcu_free);
+ } else {
+ __kmem_cache_free(b, c->size);
+ }
- if (c->size < PAGE_SIZE)
- slob_free(b, c->size);
- else
- free_pages((unsigned long)b, find_order(c->size));
+ trace_kmem_cache_free(_RET_IP_, b);
}
EXPORT_SYMBOL(kmem_cache_free);
-unsigned int kmem_cache_size(struct kmem_cache *c)
+int __kmem_cache_shutdown(struct kmem_cache *c)
{
- return c->size;
+ /* No way to check for remaining objects */
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *c)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
- return c->name;
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_name);
-static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))kmem_cache_init, 0, 0);
+struct kmem_cache kmem_cache_boot = {
+ .name = "kmem_cache",
+ .size = sizeof(struct kmem_cache),
+ .flags = SLAB_PANIC,
+ .align = ARCH_KMALLOC_MINALIGN,
+};
-void kmem_cache_init(void)
+void __init kmem_cache_init(void)
{
- void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
-
- if (p)
- free_page((unsigned long)p);
+ kmem_cache = &kmem_cache_boot;
+ slab_state = UP;
+}
- mod_timer(&slob_timer, jiffies + HZ);
+void __init kmem_cache_init_late(void)
+{
+ slab_state = FULL;
}
diff --git a/mm/slub.c b/mm/slub.c
new file mode 100644
index 00000000000..73004808537
--- /dev/null
+++ b/mm/slub.c
@@ -0,0 +1,5374 @@
+/*
+ * SLUB: A slab allocator that limits cache line use instead of queuing
+ * objects in per cpu and per node lists.
+ *
+ * The allocator synchronizes using per slab locks or atomic operations
+ * and only uses a centralized lock to manage a pool of partial slabs.
+ *
+ * (C) 2007 SGI, Christoph Lameter
+ * (C) 2011 Linux Foundation, Christoph Lameter
+ */
+
+#include <linux/mm.h>
+#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/module.h>
+#include <linux/bit_spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "slab.h"
+#include <linux/proc_fs.h>
+#include <linux/notifier.h>
+#include <linux/seq_file.h>
+#include <linux/kmemcheck.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/ctype.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/math64.h>
+#include <linux/fault-inject.h>
+#include <linux/stacktrace.h>
+#include <linux/prefetch.h>
+#include <linux/memcontrol.h>
+
+#include <trace/events/kmem.h>
+
+#include "internal.h"
+
+/*
+ * Lock order:
+ * 1. slab_mutex (Global Mutex)
+ * 2. node->list_lock
+ * 3. slab_lock(page) (Only on some arches and for debugging)
+ *
+ * slab_mutex
+ *
+ * The role of the slab_mutex is to protect the list of all the slabs
+ * and to synchronize major metadata changes to slab cache structures.
+ *
+ * The slab_lock is only used for debugging and on arches that do not
+ * have the ability to do a cmpxchg_double. It only protects the second
+ * double word in the page struct. Meaning
+ * A. page->freelist -> List of object free in a page
+ * B. page->counters -> Counters of objects
+ * C. page->frozen -> frozen state
+ *
+ * If a slab is frozen then it is exempt from list management. It is not
+ * on any list. The processor that froze the slab is the one who can
+ * perform list operations on the page. Other processors may put objects
+ * onto the freelist but the processor that froze the slab is the only
+ * one that can retrieve the objects from the page's freelist.
+ *
+ * The list_lock protects the partial and full list on each node and
+ * the partial slab counter. If taken then no new slabs may be added or
+ * removed from the lists nor make the number of partial slabs be modified.
+ * (Note that the total number of slabs is an atomic value that may be
+ * modified without taking the list lock).
+ *
+ * The list_lock is a centralized lock and thus we avoid taking it as
+ * much as possible. As long as SLUB does not have to handle partial
+ * slabs, operations can continue without any centralized lock. F.e.
+ * allocating a long series of objects that fill up slabs does not require
+ * the list lock.
+ * Interrupts are disabled during allocation and deallocation in order to
+ * make the slab allocator safe to use in the context of an irq. In addition
+ * interrupts are disabled to ensure that the processor does not change
+ * while handling per_cpu slabs, due to kernel preemption.
+ *
+ * SLUB assigns one slab for allocation to each processor.
+ * Allocations only occur from these slabs called cpu slabs.
+ *
+ * Slabs with free elements are kept on a partial list and during regular
+ * operations no list for full slabs is used. If an object in a full slab is
+ * freed then the slab will show up again on the partial lists.
+ * We track full slabs for debugging purposes though because otherwise we
+ * cannot scan all objects.
+ *
+ * Slabs are freed when they become empty. Teardown and setup is
+ * minimal so we rely on the page allocators per cpu caches for
+ * fast frees and allocs.
+ *
+ * Overloading of page flags that are otherwise used for LRU management.
+ *
+ * PageActive The slab is frozen and exempt from list processing.
+ * This means that the slab is dedicated to a purpose
+ * such as satisfying allocations for a specific
+ * processor. Objects may be freed in the slab while
+ * it is frozen but slab_free will then skip the usual
+ * list operations. It is up to the processor holding
+ * the slab to integrate the slab into the slab lists
+ * when the slab is no longer needed.
+ *
+ * One use of this flag is to mark slabs that are
+ * used for allocations. Then such a slab becomes a cpu
+ * slab. The cpu slab may be equipped with an additional
+ * freelist that allows lockless access to
+ * free objects in addition to the regular freelist
+ * that requires the slab lock.
+ *
+ * PageError Slab requires special handling due to debug
+ * options set. This moves slab handling out of
+ * the fast path and disables lockless freelists.
+ */
+
+/*
+ * Tell whether any of the SLAB_DEBUG_FLAGS bits is set in s->flags.
+ * Always evaluates to 0 when CONFIG_SLUB_DEBUG is not configured.
+ */
+static inline int kmem_cache_debug(struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB_DEBUG
+	return 0;
+#else
+	return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#endif
+}
+
+/*
+ * Returns true only when CONFIG_SLUB_CPU_PARTIAL is enabled and
+ * kmem_cache_debug(s) reports no debug flags; false otherwise.
+ */
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB_CPU_PARTIAL
+	return false;
+#else
+	return !kmem_cache_debug(s);
+#endif
+}
+
+/*
+ * Issues still to be resolved:
+ *
+ * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
+ *
+ * - Variable sizing of the per node arrays
+ */
+
+/* Enable to test recovery from slab corruption on boot */
+#undef SLUB_RESILIENCY_TEST
+
+/* Enable to log cmpxchg failures */
+#undef SLUB_DEBUG_CMPXCHG
+
+/*
+ * Minimum number of partial slabs. These will be left on the partial
+ * lists even if they are empty. kmem_cache_shrink may reclaim them.
+ */
+#define MIN_PARTIAL 5
+
+/*
+ * Maximum number of desirable partial slabs.
+ * The existence of more partial slabs makes kmem_cache_shrink
+ * sort the partial list by the number of objects in use.
+ */
+#define MAX_PARTIAL 10
+
+#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+ SLAB_POISON | SLAB_STORE_USER)
+
+/*
+ * Debugging flags that require metadata to be stored in the slab. These get
+ * disabled when slub_debug=O is used and a cache's min order increases with
+ * metadata.
+ */
+#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+
+/*
+ * Set of flags that will prevent slab merging
+ */
+#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
+ SLAB_FAILSLAB)
+
+#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_CACHE_DMA | SLAB_NOTRACK)
+
+#define OO_SHIFT 16
+#define OO_MASK ((1 << OO_SHIFT) - 1)
+#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
+
+/* Internal SLUB flags */
+#define __OBJECT_POISON 0x80000000UL /* Poison object */
+#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
+
+#ifdef CONFIG_SMP
+static struct notifier_block slab_notifier;
+#endif
+
+/*
+ * Tracking user of a slab.
+ */
+#define TRACK_ADDRS_COUNT 16
+struct track {
+ unsigned long addr; /* Called from address */
+#ifdef CONFIG_STACKTRACE
+ unsigned long addrs[TRACK_ADDRS_COUNT]; /* Stack trace saved by set_track(), zero padded */
+#endif
+ int cpu; /* Was running on cpu */
+ int pid; /* Pid context */
+ unsigned long when; /* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };
+
+#ifdef CONFIG_SYSFS
+/* Forward declarations; being static, the definitions live in this file. */
+static int sysfs_slab_add(struct kmem_cache *);
+static int sysfs_slab_alias(struct kmem_cache *, const char *);
+static void memcg_propagate_slab_attrs(struct kmem_cache *s);
+#else
+/* Without CONFIG_SYSFS these are empty stubs; the int variants return 0. */
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+ { return 0; }
+static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
+#endif
+
+/*
+ * Bump the per-cpu statistics counter @si of cache @s. Expands to nothing
+ * unless CONFIG_SLUB_STATS is enabled.
+ */
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
+{
+#if defined(CONFIG_SLUB_STATS)
+	/*
+	 * This read-modify-write can race on a preemptible kernel. That is
+	 * tolerated in exchange for not paying this_cpu_add()'s irq-disable
+	 * overhead.
+	 */
+	raw_cpu_inc(s->cpu_slab->stat[si]);
+#endif
+}
+
+/********************************************************************
+ * Core slab cache functions
+ *******************************************************************/
+
+/* Look up the per-node structure of cache @s for node id @node. */
+static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = s->node[node];
+
+	return n;
+}
+
+/*
+ * Check that @object is either NULL or the start address of an object
+ * slot inside the slab @page. Returns 1 if so, 0 otherwise.
+ */
+static inline int check_valid_pointer(struct kmem_cache *s,
+				struct page *page, const void *object)
+{
+	void *slab_start;
+
+	/* NULL is accepted as valid */
+	if (!object)
+		return 1;
+
+	slab_start = page_address(page);
+
+	/* In front of the first object, or at/after the end of the last */
+	if (object < slab_start)
+		return 0;
+	if (object >= slab_start + page->objects * s->size)
+		return 0;
+
+	/* Inside the slab but not on an object boundary */
+	if ((object - slab_start) % s->size)
+		return 0;
+
+	return 1;
+}
+
+/* Read the free pointer stored s->offset bytes into @object. */
+static inline void *get_freepointer(struct kmem_cache *s, void *object)
+{
+	void **slot = object + s->offset;
+
+	return *slot;
+}
+
+/* Issue a prefetch for the word that holds @object's free pointer. */
+static void prefetch_freepointer(const struct kmem_cache *s, void *object)
+{
+	void *slot = object + s->offset;
+
+	prefetch(slot);
+}
+
+/*
+ * Like get_freepointer(), except that with CONFIG_DEBUG_PAGEALLOC the
+ * word is fetched through probe_kernel_read() rather than by a plain
+ * dereference. The result of probe_kernel_read() is not checked here.
+ */
+static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
+{
+	void *p;
+
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	p = get_freepointer(s, object);
+#else
+	probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
+#endif
+	return p;
+}
+
+/* Store @fp as the free pointer of @object, s->offset bytes into it. */
+static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
+{
+	void **slot = object + s->offset;
+
+	*slot = fp;
+}
+
+/*
+ * Loop over all objects in a slab: __p starts at __addr and advances by
+ * (__s)->size until it reaches __addr + __objects * (__s)->size.
+ * Note that __addr, __objects and __s are re-evaluated on every iteration.
+ */
+#define for_each_object(__p, __s, __addr, __objects) \
+ for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
+ __p += (__s)->size)
+
+/*
+ * Determine object index from a given position: the byte distance of @p
+ * from the slab start @addr, divided by the per-object size s->size.
+ */
+static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
+{
+ return (p - addr) / s->size;
+}
+
+/*
+ * Pick the usable size of an object in cache @s:
+ *  - s->object_size when red zoning or poisoning is on (CONFIG_SLUB_DEBUG),
+ *  - s->inuse when SLAB_DESTROY_BY_RCU or SLAB_STORE_USER is set,
+ *  - s->size otherwise.
+ */
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	/* Debugging makes use of the padding that follows the object. */
+	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+		return s->object_size;
+#endif
+	/*
+	 * When the free pointer or the user tracking data is stored behind
+	 * the object, only the space in front of that information can be
+	 * handed out. Otherwise the whole slot, padding included, is usable.
+	 */
+	return (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) ?
+		s->inuse : s->size;
+}
+
+/*
+ * Number of @size byte objects that fit into a slab of 2^@order pages
+ * once @reserved bytes have been set aside.
+ */
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+	unsigned long usable = (PAGE_SIZE << order) - reserved;
+
+	return usable / size;
+}
+
+/*
+ * Pack a page order and the matching object count into a single value:
+ * the order is shifted up by OO_SHIFT and the object count is added to it.
+ */
+static inline struct kmem_cache_order_objects oo_make(int order,
+		unsigned long size, int reserved)
+{
+	int objects = order_objects(order, size, reserved);
+	struct kmem_cache_order_objects oo = {
+		(order << OO_SHIFT) + objects
+	};
+
+	return oo;
+}
+
+/* Extract the page order: the bits of x.x above OO_SHIFT. */
+static inline int oo_order(struct kmem_cache_order_objects x)
+{
+ return x.x >> OO_SHIFT;
+}
+
+/* Extract the object count: the low bits of x.x selected by OO_MASK. */
+static inline int oo_objects(struct kmem_cache_order_objects x)
+{
+ return x.x & OO_MASK;
+}
+
+/*
+ * Per slab locking using the pagelock
+ *
+ * slab_lock() takes a bit spinlock on the PG_locked bit of page->flags.
+ */
+static __always_inline void slab_lock(struct page *page)
+{
+ bit_spin_lock(PG_locked, &page->flags);
+}
+
+/* Release the PG_locked bit spinlock taken by slab_lock(). */
+static __always_inline void slab_unlock(struct page *page)
+{
+ __bit_spin_unlock(PG_locked, &page->flags);
+}
+
+/*
+ * Copy the frozen, inuse and objects fields encoded in @counters_new
+ * into @page, one field at a time.
+ */
+static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
+{
+	struct page decoded;
+
+	/*
+	 * page->counters can cover page->_count in addition to
+	 * frozen/inuse/objects. Storing to ->counters in one go could lose
+	 * an update to page->_count, so decode the new value in a scratch
+	 * struct page and copy across only the fields that are needed.
+	 */
+	decoded.counters = counters_new;
+
+	page->objects = decoded.objects;
+	page->inuse = decoded.inuse;
+	page->frozen = decoded.frozen;
+}
+
+/*
+ * Replace the (freelist, counters) pair of @page with the new pair if both
+ * still hold the old values. Returns 1 when the swap happened. Otherwise
+ * the CMPXCHG_DOUBLE_FAIL statistic is bumped and 0 is returned.
+ *
+ * Uses cmpxchg_double() when the cache has __CMPXCHG_DOUBLE set and the
+ * architecture supports it; falls back to compare-and-assign under
+ * slab_lock() otherwise.
+ *
+ * Interrupts must be disabled (for the fallback code to work right).
+ */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+		void *freelist_old, unsigned long counters_old,
+		void *freelist_new, unsigned long counters_new,
+		const char *n)
+{
+	VM_BUG_ON(!irqs_disabled());
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	if (s->flags & __CMPXCHG_DOUBLE) {
+		if (cmpxchg_double(&page->freelist, &page->counters,
+				   freelist_old, counters_old,
+				   freelist_new, counters_new))
+			return 1;
+	} else
+#endif
+	{
+		bool swapped = false;
+
+		slab_lock(page);
+		if (page->freelist == freelist_old &&
+		    page->counters == counters_old) {
+			page->freelist = freelist_new;
+			set_page_slub_counters(page, counters_new);
+			swapped = true;
+		}
+		slab_unlock(page);
+
+		if (swapped)
+			return 1;
+	}
+
+	/* The swap did not happen: account for it and let the caller retry */
+	cpu_relax();
+	stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+	pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+	return 0;
+}
+
+/*
+ * Same contract as __cmpxchg_double_slab(), but the slab_lock() fallback
+ * saves and disables local interrupts itself, so no irq state is asserted
+ * on entry.
+ */
+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+		void *freelist_old, unsigned long counters_old,
+		void *freelist_new, unsigned long counters_new,
+		const char *n)
+{
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	if (s->flags & __CMPXCHG_DOUBLE) {
+		if (cmpxchg_double(&page->freelist, &page->counters,
+				   freelist_old, counters_old,
+				   freelist_new, counters_new))
+			return 1;
+	} else
+#endif
+	{
+		unsigned long flags;
+		bool swapped = false;
+
+		local_irq_save(flags);
+		slab_lock(page);
+		if (page->freelist == freelist_old &&
+		    page->counters == counters_old) {
+			page->freelist = freelist_new;
+			set_page_slub_counters(page, counters_new);
+			swapped = true;
+		}
+		slab_unlock(page);
+		local_irq_restore(flags);
+
+		if (swapped)
+			return 1;
+	}
+
+	/* The swap did not happen: account for it and let the caller retry */
+	cpu_relax();
+	stat(s, CMPXCHG_DOUBLE_FAIL);
+
+#ifdef SLUB_DEBUG_CMPXCHG
+	pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+	return 0;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Walk the freelist of @page and set, for every object found on it, the
+ * bit in @map that corresponds to the object's index within the slab.
+ * Bits of objects that are not on the freelist are left untouched.
+ *
+ * Node listlock must be held to guarantee that the page does
+ * not vanish from under us.
+ */
+static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+{
+	void *addr = page_address(page);
+	void *p = page->freelist;
+
+	while (p) {
+		set_bit(slab_index(p, s, addr), map);
+		p = get_freepointer(s, p);
+	}
+}
+
+/*
+ * Debug settings:
+ */
+#ifdef CONFIG_SLUB_DEBUG_ON
+static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
+static int slub_debug;
+#endif
+
+static char *slub_debug_slabs;
+static int disable_higher_order_debug;
+
+/*
+ * Object debugging
+ */
+/*
+ * Hex dump @length bytes starting at @addr at KERN_ERR level, each line
+ * prefixed with @text and the address, 16 bytes per line in groups of 1.
+ */
+static void print_section(char *text, u8 *addr, unsigned int length)
+{
+	const int rowsize = 16;
+	const int groupsize = 1;
+
+	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, rowsize, groupsize,
+		       addr, length, 1);
+}
+
+/*
+ * Locate the struct track slot @alloc (TRACK_ALLOC or TRACK_FREE) of
+ * @object. The track array starts behind the free pointer when s->offset
+ * is set and at s->inuse otherwise.
+ */
+static struct track *get_track(struct kmem_cache *s, void *object,
+	enum track_item alloc)
+{
+	struct track *first;
+
+	first = s->offset ? object + s->offset + sizeof(void *)
+			  : object + s->inuse;
+
+	return &first[alloc];
+}
+
+/*
+ * Fill in the @alloc track record of @object. With a non-zero @addr the
+ * record receives @addr, the current cpu, pid and jiffies, and (with
+ * CONFIG_STACKTRACE) a stack trace. With @addr == 0 the record is zeroed.
+ */
+static void set_track(struct kmem_cache *s, void *object,
+			enum track_item alloc, unsigned long addr)
+{
+	struct track *t = get_track(s, object, alloc);
+
+	if (!addr) {
+		memset(t, 0, sizeof(struct track));
+		return;
+	}
+
+#ifdef CONFIG_STACKTRACE
+	{
+		struct stack_trace trace;
+		int i;
+
+		trace.nr_entries = 0;
+		trace.max_entries = TRACK_ADDRS_COUNT;
+		trace.entries = t->addrs;
+		trace.skip = 3;
+		save_stack_trace(&trace);
+
+		/* See rant in lockdep.c */
+		if (trace.nr_entries != 0 &&
+		    trace.entries[trace.nr_entries - 1] == ULONG_MAX)
+			trace.nr_entries--;
+
+		/* Zero the unused tail so print_track() knows where to stop */
+		for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
+			t->addrs[i] = 0;
+	}
+#endif
+	t->addr = addr;
+	t->cpu = smp_processor_id();
+	t->pid = current->pid;
+	t->when = jiffies;
+}
+
+/*
+ * Clear both track records (free and alloc) of @object. Does nothing for
+ * caches without SLAB_STORE_USER.
+ */
+static void init_tracking(struct kmem_cache *s, void *object)
+{
+	if (s->flags & SLAB_STORE_USER) {
+		set_track(s, object, TRACK_FREE, 0UL);
+		set_track(s, object, TRACK_ALLOC, 0UL);
+	}
+}
+
+/*
+ * Print one track record, labelled @s, followed (with CONFIG_STACKTRACE)
+ * by its saved stack entries up to the first zero entry. Records whose
+ * addr is 0 are skipped entirely.
+ */
+static void print_track(const char *s, struct track *t)
+{
+	if (!t->addr)
+		return;
+
+	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+	       s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKTRACE
+	{
+		int i;
+
+		for (i = 0; i < TRACK_ADDRS_COUNT && t->addrs[i]; i++)
+			pr_err("\t%pS\n", (void *)t->addrs[i]);
+	}
+#endif
+}
+
+/*
+ * Print the allocation and free track records of @object. Does nothing
+ * for caches without SLAB_STORE_USER.
+ */
+static void print_tracking(struct kmem_cache *s, void *object)
+{
+	if (s->flags & SLAB_STORE_USER) {
+		print_track("Allocated", get_track(s, object, TRACK_ALLOC));
+		print_track("Freed", get_track(s, object, TRACK_FREE));
+	}
+}
+
+/*
+ * Log a one-line summary of slab @page: its address, object count,
+ * objects in use, freelist head and page flags.
+ */
+static void print_page_info(struct page *page)
+{
+	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+	       page, page->objects, page->inuse, page->freelist, page->flags);
+}
+
+/*
+ * Print the banner that opens a slab bug report: the cache name, the
+ * taint string and the printf-style message built from @fmt. Also taints
+ * the kernel with TAINT_BAD_PAGE.
+ */
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+	va_list args;
+	struct va_format vaf;
+
+	vaf.fmt = fmt;
+	va_start(args, fmt);
+	vaf.va = &args;
+
+	pr_err("=============================================================================\n");
+	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+	pr_err("-----------------------------------------------------------------------------\n\n");
+	va_end(args);
+
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+
+/*
+ * Log a "FIX <cache>: <message>" line. The message is built printf-style
+ * from @fmt and a newline is appended here.
+ */
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+	va_list args;
+	struct va_format vaf;
+
+	vaf.fmt = fmt;
+	va_start(args, fmt);
+	vaf.va = &args;
+
+	pr_err("FIX %s: %pV\n", s->name, &vaf);
+	va_end(args);
+}
+
+/*
+ * Dump everything known about object @p of slab @page: track records,
+ * page summary, the 16 bytes in front of the object, the object itself,
+ * its red zone and trailing padding (where present), then a stack dump.
+ */
+static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
+{
+	u8 *slab_start = page_address(page);
+	unsigned int meta_end;	/* Offset just behind the per-object metadata */
+
+	print_tracking(s, p);
+	print_page_info(page);
+
+	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+	       p, p - slab_start, get_freepointer(s, p));
+
+	if (p > slab_start + 16)
+		print_section("Bytes b4 ", p - 16, 16);
+
+	print_section("Object ", p, min_t(unsigned long, s->object_size,
+					  PAGE_SIZE));
+	if (s->flags & SLAB_RED_ZONE)
+		print_section("Redzone ", p + s->object_size,
+			      s->inuse - s->object_size);
+
+	/* Skip the free pointer (if stored behind the object) and tracks */
+	meta_end = s->offset ? s->offset + sizeof(void *) : s->inuse;
+	if (s->flags & SLAB_STORE_USER)
+		meta_end += 2 * sizeof(struct track);
+
+	/* Whatever is left up to s->size is filler */
+	if (meta_end != s->size)
+		print_section("Padding ", p + meta_end, s->size - meta_end);
+
+	dump_stack();
+}
+
+/*
+ * Report a problem with a single object: print the bug banner with
+ * @reason as the message, then the full object dump from print_trailer().
+ */
+static void object_err(struct kmem_cache *s, struct page *page,
+ u8 *object, char *reason)
+{
+ slab_bug(s, "%s", reason);
+ print_trailer(s, page, object);
+}
+
+/*
+ * Report a problem with a whole slab: format the printf-style message
+ * into a 100 byte buffer (longer messages are truncated), print the bug
+ * banner and the page summary, then dump the stack.
+ */
+static void slab_err(struct kmem_cache *s, struct page *page,
+			const char *fmt, ...)
+{
+	char msg[100];
+	va_list ap;
+
+	va_start(ap, fmt);
+	vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+
+	slab_bug(s, "%s", msg);
+	print_page_info(page);
+	dump_stack();
+}
+
+/*
+ * Prepare the debug patterns of @object. With __OBJECT_POISON the object
+ * is filled with POISON_FREE and terminated by POISON_END; with
+ * SLAB_RED_ZONE the bytes between object_size and inuse are set to @val.
+ */
+static void init_object(struct kmem_cache *s, void *object, u8 val)
+{
+	u8 *p = object;
+
+	if (s->flags & __OBJECT_POISON) {
+		u8 *last = p + s->object_size - 1;
+
+		memset(p, POISON_FREE, last - p);
+		*last = POISON_END;
+	}
+
+	if (s->flags & SLAB_RED_ZONE) {
+		u8 *redzone = p + s->object_size;
+
+		memset(redzone, val, s->inuse - s->object_size);
+	}
+}
+
+/*
+ * Overwrite the byte range [@from, @to) with @data and log the repair.
+ *
+ * @message names the region being restored (for example "Redzone" or
+ * "slab padding") and is included in the log line so that the FIX entry
+ * can be matched to the BUG report in front of it.
+ */
+static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+						void *from, void *to)
+{
+	/* No trailing newline: slab_fix() appends one itself */
+	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
+	memset(from, data, to - from);
+}
+
+/*
+ * Verify that the @bytes bytes at @start all equal @value.
+ *
+ * Returns 1 when they do. Otherwise the damaged range, from the first
+ * bad byte to the last bad byte, is reported together with a dump of
+ * @object, the range is rewritten with @value, and 0 is returned.
+ */
+static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+			u8 *object, char *what,
+			u8 *start, unsigned int value, unsigned int bytes)
+{
+	u8 *first_bad = memchr_inv(start, value, bytes);
+	u8 *bad_end;
+
+	if (!first_bad)
+		return 1;
+
+	/* Walk back over intact bytes at the tail of the checked range */
+	for (bad_end = start + bytes;
+	     bad_end > first_bad && bad_end[-1] == value;
+	     bad_end--)
+		;
+
+	slab_bug(s, "%s overwritten", what);
+	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+	       first_bad, bad_end - 1, first_bad[0], value);
+	print_trailer(s, page, object);
+
+	restore_bytes(s, what, value, first_bad, bad_end);
+	return 0;
+}
+
+/*
+ * Object layout:
+ *
+ * object address
+ * Bytes of the object to be managed.
+ * If the freepointer may overlay the object then the free
+ * pointer is the first word of the object.
+ *
+ * Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ * 0xa5 (POISON_END)
+ *
+ * object + s->object_size
+ * Padding to reach word boundary. This is also used for Redzoning.
+ * Padding is extended by another word if Redzoning is enabled and
+ * object_size == inuse.
+ *
+ * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
+ * 0xcc (RED_ACTIVE) for objects in use.
+ *
+ * object + s->inuse
+ * Meta data starts here.
+ *
+ * A. Free pointer (if we cannot overwrite object on free)
+ * B. Tracking data for SLAB_STORE_USER
+ * C. Padding to reach required alignment boundary or at minimum
+ * one word if debugging is on to be able to detect writes
+ * before the word boundary.
+ *
+ * Padding is done using 0x5a (POISON_INUSE)
+ *
+ * object + s->size
+ * Nothing is used beyond s->size.
+ *
+ * If slabcaches are merged then the object_size and inuse boundaries are mostly
+ * ignored. And therefore no slab options that rely on these boundaries
+ * may be used with merged slabcaches.
+ */
+
+/*
+ * Check the filler between the end of the per-object metadata and s->size
+ * of object @p for POISON_INUSE. Returns 1 when there is no filler or it
+ * is intact; otherwise returns what check_bytes_and_report() returns.
+ */
+static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
+{
+	unsigned long meta_end = s->inuse;	/* The end of info */
+
+	/* Freepointer is placed after the object. */
+	if (s->offset)
+		meta_end += sizeof(void *);
+
+	/* We also have user information there */
+	if (s->flags & SLAB_STORE_USER)
+		meta_end += 2 * sizeof(struct track);
+
+	if (s->size != meta_end)
+		return check_bytes_and_report(s, page, p, "Object padding",
+					      p + meta_end, POISON_INUSE,
+					      s->size - meta_end);
+
+	return 1;
+}
+
+/*
+ * Check the pad bytes at the end of a slab page.
+ *
+ * The usable slab length is the compound page size minus s->reserved; the
+ * bytes left over after the last whole object (length % s->size) are
+ * expected to contain POISON_INUSE when SLAB_POISON is set.
+ *
+ * Returns 1 if the padding is intact (or not checked), 0 after reporting
+ * and repairing an overwrite.
+ */
+static int slab_pad_check(struct kmem_cache *s, struct page *page)
+{
+	u8 *start;
+	u8 *pad;
+	u8 *fault;
+	u8 *end;
+	int length;
+	int remainder;
+
+	if (!(s->flags & SLAB_POISON))
+		return 1;
+
+	start = page_address(page);
+	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
+	end = start + length;
+	remainder = length % s->size;
+	if (!remainder)
+		return 1;
+
+	/*
+	 * Remember where the padding starts before 'end' is trimmed below;
+	 * deriving it from the trimmed 'end' would dump and restore the
+	 * wrong range.
+	 */
+	pad = end - remainder;
+
+	fault = memchr_inv(pad, POISON_INUSE, remainder);
+	if (!fault)
+		return 1;
+
+	/* Trim intact poison at the tail so [fault, end) is the damage. */
+	while (end > fault && end[-1] == POISON_INUSE)
+		end--;
+
+	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
+	print_section("Padding ", pad, remainder);
+
+	/* Only the damaged span needs to be re-poisoned. */
+	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
+	return 0;
+}
+
+/*
+ * Validate a single object: red zone or alignment padding, poison
+ * pattern, trailing pad bytes and finally the free pointer.
+ *
+ * @val is the red zone value expected for the object's current state.
+ * Returns 1 if the object passed, 0 if a fatal inconsistency was reported.
+ * Alignment padding and pad byte problems are reported but not fatal.
+ */
+static int check_object(struct kmem_cache *s, struct page *page,
+			void *object, u8 val)
+{
+	u8 *obj = object;
+	u8 *tail = obj + s->object_size;
+	const int poisoned = (s->flags & SLAB_POISON) != 0;
+
+	if (s->flags & SLAB_RED_ZONE) {
+		if (!check_bytes_and_report(s, page, obj, "Redzone",
+				tail, val, s->inuse - s->object_size))
+			return 0;
+	} else if (poisoned && s->object_size < s->inuse) {
+		check_bytes_and_report(s, page, obj, "Alignment padding",
+				tail, POISON_INUSE,
+				s->inuse - s->object_size);
+	}
+
+	if (poisoned) {
+		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
+			/* Body must be POISON_FREE ... */
+			if (!check_bytes_and_report(s, page, obj, "Poison",
+					obj, POISON_FREE,
+					s->object_size - 1))
+				return 0;
+			/* ... terminated by a single POISON_END byte. */
+			if (!check_bytes_and_report(s, page, obj, "Poison",
+					tail - 1, POISON_END, 1))
+				return 0;
+		}
+		/*
+		 * check_pad_bytes cleans up on its own.
+		 */
+		check_pad_bytes(s, page, obj);
+	}
+
+	/*
+	 * Object and freepointer overlap. Cannot check
+	 * freepointer while object is allocated.
+	 */
+	if (val == SLUB_RED_ACTIVE && !s->offset)
+		return 1;
+
+	/* Check free pointer validity */
+	if (check_valid_pointer(s, page, get_freepointer(s, obj)))
+		return 1;
+
+	object_err(s, page, obj, "Freepointer corrupt");
+	/*
+	 * No choice but to zap it and thus lose the remainder
+	 * of the free objects in this slab. May cause
+	 * another error because the object count is now wrong.
+	 */
+	set_freepointer(s, obj, NULL);
+	return 0;
+}
+
+/*
+ * Sanity check the slab page itself: it must be a slab page, must not
+ * claim more objects than fit into its order, and must not have more
+ * objects in use than it contains.
+ *
+ * Must be called with interrupts disabled. Returns 1 if the page looks
+ * sane, 0 after reporting a problem. Damaged slab padding is reported and
+ * repaired by slab_pad_check() but does not fail the check.
+ */
+static int check_slab(struct kmem_cache *s, struct page *page)
+{
+	int maxobj;
+
+	VM_BUG_ON(!irqs_disabled());
+
+	if (!PageSlab(page)) {
+		slab_err(s, page, "Not a valid slab page");
+		return 0;
+	}
+
+	maxobj = order_objects(compound_order(page), s->size, s->reserved);
+	if (page->objects > maxobj) {
+		/* Arguments must match the two %u conversions exactly. */
+		slab_err(s, page, "objects %u > max %u",
+			page->objects, maxobj);
+		return 0;
+	}
+	if (page->inuse > page->objects) {
+		slab_err(s, page, "inuse %u > max %u",
+			page->inuse, page->objects);
+		return 0;
+	}
+	/* Slab_pad_check fixes things up after itself */
+	slab_pad_check(s, page);
+	return 1;
+}
+
+/*
+ * Determine if a certain object on a page is on the freelist. Must hold the
+ * slab lock to guarantee that the chains are in a consistent state.
+ *
+ * While walking the chain, a corrupt link is reported and cut off. After
+ * the walk, page->objects and page->inuse are compared against the values
+ * derived from the page order and the number of free objects seen, and
+ * are corrected if they differ.
+ *
+ * Returns 1 if @search was found on the freelist. Otherwise returns
+ * whether @search is NULL, except that 0 is returned right away when the
+ * very first freelist pointer is corrupt.
+ */
+static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
+{
+	int nr = 0;
+	void *fp;
+	void *object = NULL;
+	/* int so that it matches the %d conversion used below */
+	int max_objects;
+
+	fp = page->freelist;
+	while (fp && nr <= page->objects) {
+		if (fp == search)
+			return 1;
+		if (!check_valid_pointer(s, page, fp)) {
+			if (object) {
+				object_err(s, page, object,
+					"Freechain corrupt");
+				set_freepointer(s, object, NULL);
+			} else {
+				slab_err(s, page, "Freepointer corrupt");
+				page->freelist = NULL;
+				page->inuse = page->objects;
+				slab_fix(s, "Freelist cleared");
+				return 0;
+			}
+			break;
+		}
+		object = fp;
+		fp = get_freepointer(s, object);
+		nr++;
+	}
+
+	max_objects = order_objects(compound_order(page), s->size, s->reserved);
+	if (max_objects > MAX_OBJS_PER_PAGE)
+		max_objects = MAX_OBJS_PER_PAGE;
+
+	if (page->objects != max_objects) {
+		slab_err(s, page, "Wrong number of objects. Found %d but "
+			"should be %d", page->objects, max_objects);
+		page->objects = max_objects;
+		slab_fix(s, "Number of objects adjusted.");
+	}
+	if (page->inuse != page->objects - nr) {
+		slab_err(s, page, "Wrong object count. Counter is %d but "
+			"counted were %d", page->inuse, page->objects - nr);
+		page->inuse = page->objects - nr;
+		slab_fix(s, "Object count adjusted.");
+	}
+	return search == NULL;
+}
+
+/*
+ * Log an allocation (@alloc != 0) or a free (@alloc == 0) of @object when
+ * the cache has SLAB_TRACE set. On free the object contents are dumped as
+ * well. A stack dump follows in both cases.
+ */
+static void trace(struct kmem_cache *s, struct page *page, void *object,
+		  int alloc)
+{
+	if (!(s->flags & SLAB_TRACE))
+		return;
+
+	pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+		s->name,
+		alloc ? "alloc" : "free",
+		object, page->inuse,
+		page->freelist);
+
+	if (!alloc)
+		print_section("Object ", (void *)object,
+			s->object_size);
+
+	dump_stack();
+}
+
+/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ *
+ * kmalloc_large_node_hook() passes ptr, size and flags on to
+ * kmemleak_alloc().
+ */
+static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+ kmemleak_alloc(ptr, size, 1, flags); /* NOTE(review): the literal 1 is presumably kmemleak's min_count - confirm */
+}
+
+static inline void kfree_hook(const void *x)
+{
+ kmemleak_free(x); /* the only action: hand x to kmemleak_free() */
+}
+
+/*
+ * Hook run before an allocation from @s.
+ *
+ * Restricts the gfp flags to gfp_allowed_mask, feeds them to lockdep and
+ * to might_sleep_if(), and returns the verdict of should_failslab().
+ */
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+{
+	const gfp_t allowed = flags & gfp_allowed_mask;
+
+	lockdep_trace_alloc(allowed);
+	might_sleep_if(allowed & __GFP_WAIT);
+
+	return should_failslab(s->object_size, allowed, s->flags);
+}
+
+/*
+ * Hook run after an allocation from @s: reports @object to kmemcheck and
+ * kmemleak, using the gfp flags restricted to gfp_allowed_mask.
+ */
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+					gfp_t flags, void *object)
+{
+	const gfp_t allowed = flags & gfp_allowed_mask;
+
+	kmemcheck_slab_alloc(s, allowed, object, slab_ksize(s));
+	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
+				 allowed);
+}
+
+/*
+ * Hook run when object @x of cache @s is freed: notifies kmemleak, then
+ * (if kmemcheck or lockdep is configured) kmemcheck and the lock
+ * debugging code, and finally debugobjects unless the cache has
+ * SLAB_DEBUG_OBJECTS set.
+ */
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+	kmemleak_free_recursive(x, s->flags);
+
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		/*
+		 * The fast path may run with interrupts enabled, but
+		 * these debug calls expect them to be off, so disable
+		 * them just for the duration of the calls.
+		 */
+		unsigned long irq_state;
+
+		local_irq_save(irq_state);
+		kmemcheck_slab_free(s, x, s->object_size);
+		debug_check_no_locks_freed(x, s->object_size);
+		local_irq_restore(irq_state);
+	}
+#endif
+	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(x, s->object_size);
+}
+
+/*
+ * Tracking of fully allocated slabs for debugging purposes.
+ *
+ * add_full() links @page onto the node's full list. It does nothing unless
+ * the cache has SLAB_STORE_USER set; n->list_lock must be held.
+ */
+static void add_full(struct kmem_cache *s,
+		     struct kmem_cache_node *n, struct page *page)
+{
+	if (s->flags & SLAB_STORE_USER) {
+		lockdep_assert_held(&n->list_lock);
+		list_add(&page->lru, &n->full);
+	}
+}
+
+/*
+ * Unlink @page from the list it is on via page->lru. Does nothing unless
+ * the cache has SLAB_STORE_USER set; n->list_lock must be held.
+ */
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+			struct page *page)
+{
+	if (s->flags & SLAB_STORE_USER) {
+		lockdep_assert_held(&n->list_lock);
+		list_del(&page->lru);
+	}
+}
+
+/* Tracking of the number of slabs for debugging purposes */
+
+/* Return the current nr_slabs counter of cache @s on @node. */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+	return atomic_long_read(&get_node(s, node)->nr_slabs);
+}
+
+static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
+{
+ return atomic_long_read(&n->nr_slabs); /* atomic read of the per-node slab counter */
+}
+
+/*
+ * Account one more slab holding @objects objects to @node of cache @s.
+ */
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *nodeinfo = get_node(s, node);
+
+	/*
+	 * May be called early in order to allocate a slab for the
+	 * kmem_cache_node structure. Solve the chicken-egg
+	 * dilemma by deferring the increment of the count during
+	 * bootstrap (see early_kmem_cache_node_alloc).
+	 */
+	if (unlikely(!nodeinfo))
+		return;
+
+	atomic_long_inc(&nodeinfo->nr_slabs);
+	atomic_long_add(objects, &nodeinfo->total_objects);
+}
+/*
+ * Remove one slab holding @objects objects from the accounting of @node.
+ * Unlike inc_slabs_node() the node structure is used without a NULL check.
+ */
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *nodeinfo = get_node(s, node);
+
+	atomic_long_dec(&nodeinfo->nr_slabs);
+	atomic_long_sub(objects, &nodeinfo->total_objects);
+}
+
+/* Object debug checks for alloc/free paths */
+
+/*
+ * Put a new object into its debug start state (inactive red zone, cleared
+ * tracking data). Only done when at least one of SLAB_STORE_USER,
+ * SLAB_RED_ZONE or __OBJECT_POISON is set for the cache.
+ */
+static void setup_object_debug(struct kmem_cache *s, struct page *page,
+			       void *object)
+{
+	if (s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) {
+		init_object(s, object, SLUB_RED_INACTIVE);
+		init_tracking(s, object);
+	}
+}
+
+/*
+ * Debug checks performed when @object is allocated from @page.
+ *
+ * Checks the slab, the object pointer and the object itself. On success
+ * the allocation is tracked/traced, the object is marked active and 1 is
+ * returned. On failure 0 is returned and, if the page is a slab page, all
+ * of its objects are marked used so the rest of the slab is not touched
+ * again.
+ */
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+					struct page *page,
+					void *object, unsigned long addr)
+{
+	int sane = check_slab(s, page);
+
+	if (sane && !check_valid_pointer(s, page, object)) {
+		object_err(s, page, object, "Freelist Pointer check fails");
+		sane = 0;
+	}
+
+	if (sane)
+		sane = check_object(s, page, object, SLUB_RED_INACTIVE);
+
+	if (sane) {
+		/* Success perform special debug activities for allocs */
+		if (s->flags & SLAB_STORE_USER)
+			set_track(s, object, TRACK_ALLOC, addr);
+		trace(s, page, object, 1);
+		init_object(s, object, SLUB_RED_ACTIVE);
+		return 1;
+	}
+
+	if (PageSlab(page)) {
+		/*
+		 * If this is a slab page then lets do the best we can
+		 * to avoid issues in the future. Marking all objects
+		 * as used avoids touching the remaining objects.
+		 */
+		slab_fix(s, "Marking all objects used");
+		page->inuse = page->objects;
+		page->freelist = NULL;
+	}
+	return 0;
+}
+
+static noinline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+
+ spin_lock_irqsave(&n->list_lock, *flags); /* irq state is saved into the caller's *flags */
+ slab_lock(page);
+
+ if (!check_slab(s, page))
+ goto fail;
+
+ if (!check_valid_pointer(s, page, object)) {
+ slab_err(s, page, "Invalid object pointer 0x%p", object);
+ goto fail;
+ }
+
+ if (on_freelist(s, page, object)) {
+ object_err(s, page, object, "Object already free");
+ goto fail;
+ }
+
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
+ goto out; /* NOTE(review): a failed object check leaves via the success exit (n returned, list_lock held) - confirm intended */
+
+ if (unlikely(s != page->slab_cache)) {
+ if (!PageSlab(page)) {
+ slab_err(s, page, "Attempt to free object(0x%p) "
+ "outside of slab", object);
+ } else if (!page->slab_cache) {
+ pr_err("SLUB <none>: no slab for object 0x%p.\n",
+ object);
+ dump_stack();
+ } else
+ object_err(s, page, object,
+ "page slab pointer corrupt.");
+ goto fail;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_FREE, addr);
+ trace(s, page, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
+out:
+ slab_unlock(page);
+ /*
+ * Keep node_lock to preserve integrity
+ * until the object is actually freed
+ *
+ * Only the slab lock is dropped here: n->list_lock is still held
+ * (with the irq state in *flags) when n is returned.
+ */
+ return n;
+
+fail:
+ slab_unlock(page); /* failure path: both locks are released before returning NULL */
+ spin_unlock_irqrestore(&n->list_lock, *flags);
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return NULL;
+}
+
+/*
+ * Parse the "slub_debug" boot parameter.
+ *
+ * Forms handled below:
+ *   no '=' or nothing after it   all default debug flags
+ *   =,<slabs>                    default flags, slab pattern recorded
+ *   =o...                        default flags, disable_higher_order_debug set
+ *   =-...                        no debug flags
+ *   =<letters>[,<slabs>]         f, z, p, u, t, a (any case) select single
+ *                                flags; unknown letters are reported and
+ *                                skipped
+ *
+ * Always returns 1.
+ */
+static int __init setup_slub_debug(char *str)
+{
+	slub_debug = DEBUG_DEFAULT_FLAGS;
+
+	/* No options specified. Switch on full debugging. */
+	if (*str++ != '=' || !*str)
+		goto out;
+
+	/*
+	 * No options but restriction on slabs. This means full
+	 * debugging for slabs matching a pattern.
+	 */
+	if (*str == ',')
+		goto check_slabs;
+
+	if (tolower(*str) == 'o') {
+		/*
+		 * Avoid enabling debugging on caches if its minimum order
+		 * would increase as a result.
+		 */
+		disable_higher_order_debug = 1;
+		goto out;
+	}
+
+	slub_debug = 0;
+
+	/* Switch off all debugging measures. */
+	if (*str == '-')
+		goto out;
+
+	/* Determine which debug features should be switched on */
+	while (*str && *str != ',') {
+		switch (tolower(*str)) {
+		case 'f':
+			slub_debug |= SLAB_DEBUG_FREE;
+			break;
+		case 'z':
+			slub_debug |= SLAB_RED_ZONE;
+			break;
+		case 'p':
+			slub_debug |= SLAB_POISON;
+			break;
+		case 'u':
+			slub_debug |= SLAB_STORE_USER;
+			break;
+		case 't':
+			slub_debug |= SLAB_TRACE;
+			break;
+		case 'a':
+			slub_debug |= SLAB_FAILSLAB;
+			break;
+		default:
+			pr_err("slub_debug option '%c' unknown. skipped\n",
+			       *str);
+			break;
+		}
+		str++;
+	}
+
+check_slabs:
+	if (*str == ',')
+		slub_debug_slabs = str + 1;
+out:
+	return 1;
+}
+
+__setup("slub_debug", setup_slub_debug);
+
+/*
+ * Compute the effective flags for a cache being created.
+ *
+ * The flags selected on the kernel command line (slub_debug) are added
+ * when either no slab pattern was given, or @name is non-NULL and begins
+ * with the pattern in slub_debug_slabs. @object_size and @ctor are not
+ * consulted.
+ */
+static unsigned long kmem_cache_flags(unsigned long object_size,
+	unsigned long flags, const char *name,
+	void (*ctor)(void *))
+{
+	if (!slub_debug)
+		return flags;
+
+	if (slub_debug_slabs) {
+		if (!name)
+			return flags;
+		if (strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))
+			return flags;
+	}
+
+	/* Enable debugging as selected on the kernel commandline. */
+	flags |= slub_debug;
+	return flags;
+}
+#else
+static inline void setup_object_debug(struct kmem_cache *s,
+ struct page *page, void *object) {} /* empty stub used in the #else branch of CONFIG_SLUB_DEBUG */
+
+static inline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr) { return 0; } /* stub: always 0 */
+
+static inline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags) { return NULL; } /* stub: takes no lock, always NULL */
+
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+ { return 1; } /* stub: nothing is checked, always 1 */
+static inline int check_object(struct kmem_cache *s, struct page *page,
+ void *object, u8 val) { return 1; } /* stub: nothing is checked, always 1 */
+static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {} /* stub: no full-list tracking in this configuration */
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {} /* stub: no full-list tracking in this configuration */
+static inline unsigned long kmem_cache_flags(unsigned long object_size,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *))
+{
+ return flags; /* stub: flags are passed through unchanged */
+}
+#define slub_debug 0
+
+#define disable_higher_order_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+ { return 0; } /* stub: no slab counters are kept, always 0 */
+static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
+ { return 0; } /* stub: no slab counters are kept, always 0 */
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+ int objects) {} /* stub: no accounting */
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+ int objects) {} /* stub: no accounting */
+
+static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+ kmemleak_alloc(ptr, size, 1, flags); /* same call as in the other branch of the conditional; the literal 1 is presumably min_count - confirm */
+}
+
+static inline void kfree_hook(const void *x)
+{
+ kmemleak_free(x); /* the only action: hand x to kmemleak_free() */
+}
+
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+ { return 0; } /* stub: no lockdep/might_sleep/failslab processing, always 0 */
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ void *object)
+{
+ kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
+ flags & gfp_allowed_mask); /* only kmemleak is notified here; flags are limited to gfp_allowed_mask */
+}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+ kmemleak_free_recursive(x, s->flags); /* only kmemleak is notified in this variant */
+}
+
+#endif /* CONFIG_SLUB_DEBUG */
+
+/*
+ * Slab allocation and freeing
+ */
+
+/*
+ * Allocate the pages backing one slab of the order encoded in @oo.
+ *
+ * The allocation is charged through memcg_charge_slab() first; if the page
+ * allocation then fails the charge is undone. With NUMA_NO_NODE any node
+ * may be used, otherwise the pages are requested from exactly @node.
+ * Returns the page or NULL.
+ */
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+		gfp_t flags, int node, struct kmem_cache_order_objects oo)
+{
+	const int order = oo_order(oo);
+	struct page *slab_page;
+
+	flags |= __GFP_NOTRACK;
+
+	if (memcg_charge_slab(s, flags, order))
+		return NULL;
+
+	slab_page = (node == NUMA_NO_NODE) ?
+			alloc_pages(flags, order) :
+			alloc_pages_exact_node(node, flags, order);
+
+	if (!slab_page)
+		memcg_uncharge_slab(s, order);
+
+	return slab_page;
+}
+
+static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ struct page *page;
+ struct kmem_cache_order_objects oo = s->oo; /* first try the cache's preferred order */
+ gfp_t alloc_gfp;
+
+ flags &= gfp_allowed_mask;
+
+ if (flags & __GFP_WAIT)
+ local_irq_enable(); /* undone by the local_irq_disable() further down */
+
+ flags |= s->allocflags;
+
+ /*
+ * Let the initial higher-order allocation fail under memory pressure
+ * so we fall-back to the minimum order allocation.
+ */
+ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
+
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
+ if (unlikely(!page)) {
+ oo = s->min; /* retry with the cache's minimum order */
+ alloc_gfp = flags; /* without the NOWARN/NORETRY additions and the NOFAIL masking */
+ /*
+ * Allocation may have failed due to fragmentation.
+ * Try a lower order alloc if possible
+ */
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
+
+ if (page)
+ stat(s, ORDER_FALLBACK);
+ }
+
+ if (kmemcheck_enabled && page
+ && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+ int pages = 1 << oo_order(oo);
+
+ kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
+
+ /*
+ * Objects from caches that have a constructor don't get
+ * cleared when they're allocated, so we need to do it here.
+ */
+ if (s->ctor)
+ kmemcheck_mark_uninitialized_pages(page, pages);
+ else
+ kmemcheck_mark_unallocated_pages(page, pages);
+ }
+
+ if (flags & __GFP_WAIT) /* tested after s->allocflags was OR-ed into flags */
+ local_irq_disable();
+ if (!page)
+ return NULL;
+
+ page->objects = oo_objects(oo); /* object count for the order that was actually allocated */
+ mod_zone_page_state(page_zone(page),
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+ 1 << oo_order(oo)); /* account the pages to the matching zone slab counter */
+
+ return page;
+}
+
+static void setup_object(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ setup_object_debug(s, page, object); /* debug initialisation runs before the constructor */
+ if (unlikely(s->ctor))
+ s->ctor(object); /* cache constructor, if the cache has one */
+}
+
+/*
+ * Allocate a new slab for @s and initialise all objects in it.
+ *
+ * Every object is set up with setup_object() and the objects are chained
+ * through their free pointers in address order, the last one ending the
+ * chain with NULL. The page is returned frozen, with page->freelist at the
+ * first object and inuse set to the total number of objects. Returns NULL
+ * if no page could be allocated.
+ */
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+	struct page *page;
+	void *base;
+	void *prev;
+	void *obj;
+	int order;
+
+	BUG_ON(flags & GFP_SLAB_BUG_MASK);
+
+	page = allocate_slab(s,
+		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+	if (!page)
+		return NULL;
+
+	order = compound_order(page);
+	inc_slabs_node(s, page_to_nid(page), page->objects);
+	page->slab_cache = s;
+	__SetPageSlab(page);
+	if (page->pfmemalloc)
+		SetPageSlabPfmemalloc(page);
+
+	base = page_address(page);
+
+	if (unlikely(s->flags & SLAB_POISON))
+		memset(base, POISON_INUSE, PAGE_SIZE << order);
+
+	/* Each object's free pointer names the object that follows it. */
+	prev = base;
+	for_each_object(obj, s, base, page->objects) {
+		setup_object(s, page, prev);
+		set_freepointer(s, prev, obj);
+		prev = obj;
+	}
+	setup_object(s, page, prev);
+	set_freepointer(s, prev, NULL);
+
+	page->freelist = base;
+	page->inuse = page->objects;
+	page->frozen = 1;
+
+	return page;
+}
+
+/*
+ * Give the pages of a slab back to the page allocator.
+ *
+ * For debug caches the slab padding and every object are checked one last
+ * time. The zone slab counters and the current task's reclaim_state (if
+ * any) are updated, the slab page flags are cleared, and the memcg charge
+ * is dropped after the pages are freed.
+ */
+static void __free_slab(struct kmem_cache *s, struct page *page)
+{
+	const int order = compound_order(page);
+	const int nr_pages = 1 << order;
+
+	if (kmem_cache_debug(s)) {
+		void *obj;
+
+		slab_pad_check(s, page);
+		for_each_object(obj, s, page_address(page),
+						page->objects)
+			check_object(s, page, obj, SLUB_RED_INACTIVE);
+	}
+
+	kmemcheck_free_shadow(page, order);
+
+	mod_zone_page_state(page_zone(page),
+		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
+		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+		-nr_pages);
+
+	__ClearPageSlabPfmemalloc(page);
+	__ClearPageSlab(page);
+
+	page_mapcount_reset(page);
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_pages;
+	__free_pages(page, order);
+	memcg_uncharge_slab(s, order);
+}
+
+#define need_reserve_slab_rcu \
+ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
+/*
+ * RCU callback that performs the deferred slab free.
+ *
+ * The rcu_head either lives inside the slab memory (when struct page's lru
+ * field is too small to hold it) or overlays page->lru; the owning page is
+ * recovered accordingly.
+ */
+static void rcu_free_slab(struct rcu_head *h)
+{
+	struct page *page = need_reserve_slab_rcu ?
+		virt_to_head_page(h) :
+		container_of((struct list_head *)h, struct page, lru);
+
+	__free_slab(page->slab_cache, page);
+}
+
+/*
+ * Free a slab, either immediately or, for SLAB_DESTROY_BY_RCU caches,
+ * through call_rcu() with rcu_free_slab() as the callback.
+ */
+static void free_slab(struct kmem_cache *s, struct page *page)
+{
+	struct rcu_head *head;
+
+	if (likely(!(s->flags & SLAB_DESTROY_BY_RCU))) {
+		__free_slab(s, page);
+		return;
+	}
+
+	if (need_reserve_slab_rcu) {
+		/* The rcu_head sits in the reserved bytes at the slab end. */
+		int order = compound_order(page);
+		int offset = (PAGE_SIZE << order) - s->reserved;
+
+		VM_BUG_ON(s->reserved != sizeof(*head));
+		head = page_address(page) + offset;
+	} else {
+		/*
+		 * RCU free overloads the RCU head over the LRU
+		 */
+		head = (void *)&page->lru;
+	}
+
+	call_rcu(head, rcu_free_slab);
+}
+
+static void discard_slab(struct kmem_cache *s, struct page *page)
+{
+ dec_slabs_node(s, page_to_nid(page), page->objects); /* drop the node's slab/object accounting first */
+ free_slab(s, page); /* then release the slab (deferred via RCU for SLAB_DESTROY_BY_RCU caches) */
+}
+
+/*
+ * Management of partially allocated slabs.
+ */
+
+/*
+ * Put @page on the node's partial list and bump nr_partial. The page goes
+ * to the tail when @tail is DEACTIVATE_TO_TAIL, to the head otherwise.
+ */
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+{
+	n->nr_partial++;
+
+	if (tail != DEACTIVATE_TO_TAIL)
+		list_add(&page->lru, &n->partial);
+	else
+		list_add_tail(&page->lru, &n->partial);
+}
+
+static inline void add_partial(struct kmem_cache_node *n,
+ struct page *page, int tail)
+{
+ lockdep_assert_held(&n->list_lock); /* locked variant: asserts n->list_lock, then defers to __add_partial() */
+ __add_partial(n, page, tail);
+}
+
+static inline void
+__remove_partial(struct kmem_cache_node *n, struct page *page)
+{
+ list_del(&page->lru); /* unlink from whichever list page->lru is on */
+ n->nr_partial--; /* keep the node's partial count in step */
+}
+
+static inline void remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ lockdep_assert_held(&n->list_lock); /* locked variant: asserts n->list_lock, then defers to __remove_partial() */
+ __remove_partial(n, page);
+}
+
+/*
+ * Remove slab from the partial list, freeze it and
+ * return the pointer to the freelist.
+ *
+ * Returns a list of objects or NULL if it fails.
+ *
+ * @mode: when non-zero the whole freelist is taken away from the page
+ * (new freelist NULL, inuse set to page->objects); when zero the freelist
+ * is left on the page and only the frozen bit is set.
+ * @objects: out parameter, set to objects - inuse as found in the counters
+ * read before the update (written even if the cmpxchg then fails).
+ *
+ * The caller must hold n->list_lock (asserted below).
+ */
+static inline void *acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode, int *objects)
+{
+ void *freelist;
+ unsigned long counters;
+ struct page new; /* on-stack copy used only to compose the new freelist/counters pair */
+
+ lockdep_assert_held(&n->list_lock);
+
+ /*
+ * Zap the freelist and set the frozen bit.
+ * The old freelist is the list of objects for the
+ * per cpu allocation list.
+ */
+ freelist = page->freelist;
+ counters = page->counters;
+ new.counters = counters; /* presumably counters overlays inuse/objects/frozen in struct page - confirm */
+ *objects = new.objects - new.inuse;
+ if (mode) {
+ new.inuse = page->objects;
+ new.freelist = NULL;
+ } else {
+ new.freelist = freelist;
+ }
+
+ VM_BUG_ON(new.frozen);
+ new.frozen = 1;
+
+ if (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ new.freelist, new.counters,
+ "acquire_slab"))
+ return NULL; /* page changed since it was read; it stays on the partial list */
+
+ remove_partial(n, page);
+ WARN_ON(!freelist);
+ return freelist;
+}
+
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
+
+/*
+ * Try to allocate a partial slab from a specific node.
+ *
+ * Walks n->partial under n->list_lock, skipping pages that fail
+ * pfmemalloc_match(). The first slab acquired becomes c->page and its
+ * freelist is the return value; slabs acquired after that are handed to
+ * put_cpu_partial(). The walk stops when acquire_slab() fails, when the
+ * cache has no cpu partial support, or once more than s->cpu_partial / 2
+ * free objects have been gathered. Returns NULL if nothing was acquired.
+ */
+static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct kmem_cache_cpu *c, gfp_t flags)
+{
+ struct page *page, *page2;
+ void *object = NULL;
+ int available = 0;
+ int objects;
+
+ /*
+ * Racy check. If we mistakenly see no partial slabs then we
+ * just allocate an empty slab. If we mistakenly try to get a
+ * partial slab and there is none available then get_partials()
+ * will return NULL.
+ */
+ if (!n || !n->nr_partial)
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ void *t;
+
+ if (!pfmemalloc_match(page, flags))
+ continue;
+
+ t = acquire_slab(s, n, page, object == NULL, &objects); /* mode is 1 only for the first slab taken */
+ if (!t)
+ break;
+
+ available += objects;
+ if (!object) {
+ c->page = page;
+ stat(s, ALLOC_FROM_PARTIAL);
+ object = t;
+ } else {
+ put_cpu_partial(s, page, 0); /* drain == 0 */
+ stat(s, CPU_PARTIAL_NODE);
+ }
+ if (!kmem_cache_has_cpu_partial(s)
+ || available > s->cpu_partial / 2)
+ break;
+
+ }
+ spin_unlock(&n->list_lock);
+ return object;
+}
+
+/*
+ * Get a page from somewhere. Search in increasing NUMA distances.
+ *
+ * Without CONFIG_NUMA this always returns NULL. With it, the zonelist of
+ * mempolicy_slab_node() is walked and get_partial_node() is tried on each
+ * allowed node that has more than s->min_partial partial slabs. The walk
+ * is repeated while read_mems_allowed_retry() reports a concurrent change.
+ */
+static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_NUMA
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type high_zoneidx = gfp_zone(flags);
+ void *object;
+ unsigned int cpuset_mems_cookie;
+
+ /*
+ * The defrag ratio allows a configuration of the tradeoffs between
+ * inter node defragmentation and node local allocations. A lower
+ * defrag_ratio increases the tendency to do local allocations
+ * instead of attempting to obtain partial slabs from other nodes.
+ *
+ * If the defrag_ratio is set to 0 then kmalloc() always
+ * returns node local objects. If the ratio is higher then kmalloc()
+ * may return off node objects because partial slabs are obtained
+ * from other nodes and filled up.
+ *
+ * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
+ * defrag_ratio = 1000) then every (well almost) allocation will
+ * first attempt to defrag slab caches on other nodes. This means
+ * scanning over all nodes to look for partial slabs which may be
+ * expensive if we do it every time we are trying to find a slab
+ * with available objects.
+ */
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
+ return NULL; /* ratio is 0, or this call's cycle-counter sample exceeded it */
+
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ n->nr_partial > s->min_partial) {
+ object = get_partial_node(s, n, c, flags);
+ if (object) {
+ /*
+ * Don't check read_mems_allowed_retry()
+ * here - if mems_allowed was updated in
+ * parallel, that was a harmless race
+ * between allocation and the cpuset
+ * update
+ */
+ return object;
+ }
+ }
+ }
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+#endif
+ return NULL;
+}
+
+/*
+ * Get a partial page, lock it and return it.
+ *
+ * The node searched first is @node, or the local memory node when @node is
+ * NUMA_NO_NODE. Other nodes are only tried (via get_any_partial()) when no
+ * specific node was requested and the first search found nothing.
+ */
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+		struct kmem_cache_cpu *c)
+{
+	int searchnode = node;
+	void *object;
+
+	if (node == NUMA_NO_NODE)
+		searchnode = numa_mem_id();
+
+	object = get_partial_node(s, get_node(s, searchnode), c, flags);
+	if (!object && node == NUMA_NO_NODE)
+		return get_any_partial(s, flags, c);
+
+	return object;
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+ return tid + TID_STEP; /* advance by one step; tid % TID_STEP is unchanged by this */
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+ return tid % TID_STEP; /* remainder part of the tid: the value init_tid() started from */
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+ return tid / TID_STEP; /* quotient part of the tid: number of next_tid() steps taken */
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+ return cpu; /* the initial tid of a cpu is simply its number */
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+ const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); /* tid stored for the cpu we run on now */
+
+ pr_info("%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+ if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ pr_warn("due to cpu change %d -> %d\n",
+ tid_to_cpu(tid), tid_to_cpu(actual_tid));
+ else /* this else pairs with the if that follows the #endif */
+#endif
+ if (tid_to_event(tid) != tid_to_event(actual_tid))
+ pr_warn("due to cpu running other code. Event %ld->%ld\n",
+ tid_to_event(tid), tid_to_event(actual_tid));
+ else
+ pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ actual_tid, tid, next_tid(tid));
+#endif
+ stat(s, CMPXCHG_DOUBLE_CPU_FAIL); /* counted whether or not SLUB_DEBUG_CMPXCHG is defined */
+}
+
+/*
+ * Give the per cpu structure of every possible cpu its initial
+ * transaction id (init_tid() of the cpu number).
+ */
+static void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+		c->tid = init_tid(cpu);
+	}
+}
+
+/*
+ * Remove the cpu slab
+ *
+ * @freelist is the per cpu list of free objects belonging to @page.
+ * Stage one pushes all but the last of those objects back onto
+ * page->freelist, one cmpxchg per object. Stage two pushes the last
+ * object (if any), clears the frozen bit and places the page on the
+ * node's partial list, on the full list (see add_full()), or discards it
+ * when it is empty and the node already has at least s->min_partial
+ * partial slabs.
+ */
+static void deactivate_slab(struct kmem_cache *s, struct page *page,
+ void *freelist)
+{
+ enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ int lock = 0; /* set once n->list_lock has been taken */
+ enum slab_modes l = M_NONE, m = M_NONE; /* l: list state already applied, m: state targeted by the current pass */
+ void *nextfree;
+ int tail = DEACTIVATE_TO_HEAD;
+ struct page new;
+ struct page old;
+
+ if (page->freelist) {
+ stat(s, DEACTIVATE_REMOTE_FREES);
+ tail = DEACTIVATE_TO_TAIL;
+ }
+
+ /*
+ * Stage one: Free all available per cpu objects back
+ * to the page freelist while it is still frozen. Leave the
+ * last one.
+ *
+ * There is no need to take the list->lock because the page
+ * is still frozen.
+ */
+ while (freelist && (nextfree = get_freepointer(s, freelist))) {
+ void *prior;
+ unsigned long counters;
+
+ do {
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, freelist, prior);
+ new.counters = counters;
+ new.inuse--;
+ VM_BUG_ON(!new.frozen);
+
+ } while (!__cmpxchg_double_slab(s, page,
+ prior, counters,
+ freelist, new.counters,
+ "drain percpu freelist"));
+
+ freelist = nextfree;
+ }
+
+ /*
+ * Stage two: Ensure that the page is unfrozen while the
+ * list presence reflects the actual number of objects
+ * during unfreeze.
+ *
+ * We setup the list membership and then perform a cmpxchg
+ * with the count. If there is a mismatch then the page
+ * is not unfrozen but the page is on the wrong list.
+ *
+ * Then we restart the process which may have to remove
+ * the page from the list that we just put it on again
+ * because the number of objects in the slab may have
+ * changed.
+ */
+redo:
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ /* Determine target state of the slab */
+ new.counters = old.counters;
+ if (freelist) {
+ new.inuse--;
+ set_freepointer(s, freelist, old.freelist);
+ new.freelist = freelist;
+ } else
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ if (!new.inuse && n->nr_partial >= s->min_partial)
+ m = M_FREE;
+ else if (new.freelist) {
+ m = M_PARTIAL;
+ if (!lock) {
+ lock = 1;
+ /*
+ * Taking the spinlock removes the possibility
+ * that acquire_slab() will see a slab page that
+ * is frozen
+ */
+ spin_lock(&n->list_lock);
+ }
+ } else {
+ m = M_FULL;
+ if (kmem_cache_debug(s) && !lock) {
+ lock = 1;
+ /*
+ * This also ensures that the scanning of full
+ * slabs from diagnostic functions will not see
+ * any frozen slabs.
+ */
+ spin_lock(&n->list_lock);
+ }
+ }
+
+ if (l != m) {
+
+ if (l == M_PARTIAL)
+
+ remove_partial(n, page);
+
+ else if (l == M_FULL)
+
+ remove_full(s, n, page);
+
+ if (m == M_PARTIAL) {
+
+ add_partial(n, page, tail);
+ stat(s, tail);
+
+ } else if (m == M_FULL) {
+
+ stat(s, DEACTIVATE_FULL);
+ add_full(s, n, page);
+
+ }
+ }
+
+ l = m;
+ if (!__cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"))
+ goto redo; /* page changed underneath us: recompute the target state */
+
+ if (lock)
+ spin_unlock(&n->list_lock);
+
+ if (m == M_FREE) {
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+}
+
+/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or some other guarantee must be there
+ * to guarantee no concurrent accesses).
+ *
+ * Each page on c->partial is unfrozen with a cmpxchg while its node's
+ * list_lock is held. A page with no objects in use is queued for
+ * discarding when the node already has at least s->min_partial partial
+ * slabs; every other page goes to the tail of the node's partial list.
+ * The queued pages are discarded after the last list_lock is dropped.
+ * The body is empty without CONFIG_SLUB_CPU_PARTIAL.
+ */
+static void unfreeze_partials(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct kmem_cache_node *n = NULL, *n2 = NULL;
+ struct page *page, *discard_page = NULL;
+
+ while ((page = c->partial)) {
+ struct page new;
+ struct page old;
+
+ c->partial = page->next;
+
+ n2 = get_node(s, page_to_nid(page));
+ if (n != n2) { /* switch locks only when the page belongs to a different node */
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
+
+ do {
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ new.counters = old.counters;
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ } while (!__cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"));
+
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
+ page->next = discard_page; /* chain empty pages through page->next for later discard */
+ discard_page = page;
+ } else {
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ }
+
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ while (discard_page) {
+ page = discard_page;
+ discard_page = discard_page->next;
+
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+#endif
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free) into a partial page
+ * slot if available. This is done without interrupts disabled and without
+ * preemption disabled. The cmpxchg is racy and may put the partial page
+ * onto a random cpus partial slot.
+ *
+ * If we did not find a slot then simply move all the partials to the
+ * per node partial list.
+ *
+ * @drain: when non-zero and the current cpu partial list already accounts
+ * for more than s->cpu_partial free objects, that list is first moved to
+ * the node lists with unfreeze_partials(). The new head page carries the
+ * running totals for the whole list in page->pages and page->pobjects.
+ * The body is empty without CONFIG_SLUB_CPU_PARTIAL.
+ */
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct page *oldpage;
+ int pages;
+ int pobjects;
+
+ do {
+ pages = 0;
+ pobjects = 0;
+ oldpage = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldpage) {
+ pobjects = oldpage->pobjects;
+ pages = oldpage->pages;
+ if (drain && pobjects > s->cpu_partial) {
+ unsigned long flags;
+ /*
+ * partial array is full. Move the existing
+ * set to the per node partial list.
+ */
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ oldpage = NULL;
+ pobjects = 0;
+ pages = 0;
+ stat(s, CPU_PARTIAL_DRAIN);
+ }
+ }
+
+ pages++;
+ pobjects += page->objects - page->inuse;
+
+ page->pages = pages;
+ page->pobjects = pobjects;
+ page->next = oldpage;
+
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
+ != oldpage); /* retry if the slot no longer holds oldpage */
+#endif
+}
+
+/*
+ * Deactivate the current cpu slab of @c and reset the per cpu state.
+ *
+ * The page and its remaining per cpu freelist are handed to
+ * deactivate_slab(); afterwards the tid is advanced and both the page and
+ * freelist pointers are cleared.
+ */
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+{
+ stat(s, CPUSLAB_FLUSH);
+ deactivate_slab(s, c->page, c->freelist);
+
+ /* A fastpath cmpxchg_double that sampled the old tid will now fail. */
+ c->tid = next_tid(c->tid);
+ c->page = NULL;
+ c->freelist = NULL;
+}
+
+/*
+ * Flush the cpu slab and the per cpu partial list of processor @cpu.
+ *
+ * Called from IPI handler with interrupts disabled.
+ */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+{
+	struct kmem_cache_cpu *cpu_slab = per_cpu_ptr(s->cpu_slab, cpu);
+
+	/* Nothing to do when there is no per cpu structure. */
+	if (unlikely(!cpu_slab))
+		return;
+
+	if (cpu_slab->page)
+		flush_slab(s, cpu_slab);
+
+	unfreeze_partials(s, cpu_slab);
+}
+
+/*
+ * Per cpu callback used by flush_all(): @d is the kmem_cache to flush on
+ * the processor this function is running on.
+ */
+static void flush_cpu_slab(void *d)
+{
+	int this_cpu = smp_processor_id();
+
+	__flush_cpu_slab(d, this_cpu);
+}
+
+/*
+ * Tell whether processor @cpu holds anything worth flushing for the cache
+ * passed in @info: either an active cpu slab or a per cpu partial list.
+ */
+static bool has_cpu_slab(int cpu, void *info)
+{
+	struct kmem_cache *cache = info;
+	struct kmem_cache_cpu *cpu_slab = per_cpu_ptr(cache->cpu_slab, cpu);
+
+	if (cpu_slab->page)
+		return true;
+
+	return cpu_slab->partial != NULL;
+}
+
+/*
+ * Flush the per cpu slabs of @s on all processors that have any.
+ *
+ * has_cpu_slab() selects the processors, flush_cpu_slab() does the work
+ * on each of them.
+ */
+static void flush_all(struct kmem_cache *s)
+{
+ /* NOTE(review): the literal 1 is presumably the "wait" argument -- confirm. */
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+}
+
+/*
+ * Decide whether @page is acceptable for an allocation aimed at @node.
+ *
+ * With CONFIG_NUMA a missing page never matches, and a page on a different
+ * node only matches when the caller passed NUMA_NO_NODE. Without
+ * CONFIG_NUMA every page matches.
+ */
+static inline int node_match(struct page *page, int node)
+{
+#ifdef CONFIG_NUMA
+	if (!page)
+		return 0;
+
+	if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+		return 0;
+#endif
+	return 1;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/* Number of objects in @page that are not currently in use. */
+static int count_free(struct page *page)
+{
+	int nr_free = page->objects - page->inuse;
+
+	return nr_free;
+}
+
+/* Read the total_objects counter of node structure @n. */
+static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
+{
+ return atomic_long_read(&n->total_objects);
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+/*
+ * Sum get_count() over every page on the partial list of @n.
+ *
+ * The walk is done under n->list_lock with interrupts disabled.
+ */
+static unsigned long count_partial(struct kmem_cache_node *n,
+				int (*get_count)(struct page *))
+{
+	struct page *cursor;
+	unsigned long irq_flags;
+	unsigned long total = 0;
+
+	spin_lock_irqsave(&n->list_lock, irq_flags);
+	list_for_each_entry(cursor, &n->partial, lru)
+		total += get_count(cursor);
+	spin_unlock_irqrestore(&n->list_lock, irq_flags);
+
+	return total;
+}
+#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+
+/*
+ * Report a failed slab allocation for cache @s.
+ *
+ * @gfpflags: flags of the failed request; nothing is printed when
+ *	__GFP_NOWARN is set
+ * @nid: node the allocation was aimed at
+ *
+ * The output is rate limited. Besides the cache geometry, one line per
+ * online node with a kmem_cache_node shows its slab, object and free
+ * object counts. Only compiled in with CONFIG_SLUB_DEBUG.
+ */
+static noinline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	int node;
+
+	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
+		return;
+
+	pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+		nid, gfpflags);
+	pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
+		s->name, s->object_size, s->size, oo_order(s->oo),
+		oo_order(s->min));
+
+	if (oo_order(s->min) > get_order(s->object_size))
+		pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+			s->name);
+
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = get_node(s, node);
+		unsigned long nr_slabs;
+		unsigned long nr_objs;
+		unsigned long nr_free;
+
+		if (!n)
+			continue;
+
+		nr_free = count_partial(n, count_free);
+		nr_slabs = node_nr_slabs(n);
+		nr_objs = node_nr_objs(n);
+
+		/* The counters are unsigned long, so print them with %lu. */
+		pr_warn(" node %d: slabs: %lu, objs: %lu, free: %lu\n",
+			node, nr_slabs, nr_objs, nr_free);
+	}
+#endif
+}
+
+/*
+ * Obtain a freelist for the current cpu, either from a partial slab or
+ * from a freshly allocated slab.
+ *
+ * @pc: in/out pointer to the per cpu structure; it is updated when a new
+ * slab had to be allocated, because the per cpu pointer is re-read
+ * after new_slab() returns.
+ *
+ * Returns the freelist, or NULL if neither get_partial() nor new_slab()
+ * produced one.
+ */
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+ int node, struct kmem_cache_cpu **pc)
+{
+ void *freelist;
+ struct kmem_cache_cpu *c = *pc;
+ struct page *page;
+
+ freelist = get_partial(s, flags, node, c);
+
+ if (freelist)
+ return freelist;
+
+ page = new_slab(s, flags, node);
+ if (page) {
+ /* Re-read the per cpu pointer and flush any slab found there. */
+ c = raw_cpu_ptr(s->cpu_slab);
+ if (c->page)
+ flush_slab(s, c);
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ freelist = page->freelist;
+ page->freelist = NULL;
+
+ stat(s, ALLOC_SLAB);
+ c->page = page;
+ *pc = c;
+ } else
+ freelist = NULL;
+
+ return freelist;
+}
+
+/*
+ * A page flagged PageSlabPfmemalloc may only serve requests for which
+ * gfp_pfmemalloc_allowed() holds; any other page always matches.
+ */
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+{
+	bool allowed = true;
+
+	if (unlikely(PageSlabPfmemalloc(page)))
+		allowed = gfp_pfmemalloc_allowed(gfpflags);
+
+	return allowed;
+}
+
+/*
+ * Check the page->freelist of a page and either transfer the freelist to the
+ * per cpu freelist or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ *
+ * This function must be called with interrupt disabled.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+ struct page new;
+ unsigned long counters;
+ void *freelist;
+
+ do {
+ freelist = page->freelist;
+ counters = page->counters;
+
+ new.counters = counters;
+ VM_BUG_ON(!new.frozen);
+
+ /*
+ * The whole freelist is taken, so every object counts as in use;
+ * the page only stays frozen if there was something to take.
+ */
+ new.inuse = page->objects;
+ new.frozen = freelist != NULL;
+
+ /* Install a NULL page freelist together with the new counters. */
+ } while (!__cmpxchg_double_slab(s, page,
+ freelist, counters,
+ NULL, new.counters,
+ "get_freelist"));
+
+ return freelist;
+}
+
+/*
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
+ *
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
+ *
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since it involves
+ * a call to the page allocator and the setup of a new slab.
+ *
+ * @node: node the object should come from, checked with node_match()
+ * @addr: caller address, only handed to alloc_debug_processing()
+ * @c: per cpu structure sampled by the caller; reloaded here under
+ * CONFIG_PREEMPT once interrupts are off
+ *
+ * Returns the allocated object, or NULL if no slab could be obtained.
+ * Interrupts are disabled for the whole function and restored on return.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *freelist;
+ struct page *page;
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+ page = c->page;
+ if (!page)
+ goto new_slab;
+redo:
+
+ if (unlikely(!node_match(page, node))) {
+ stat(s, ALLOC_NODE_MISMATCH);
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
+ goto new_slab;
+ }
+
+ /*
+ * By rights, we should be searching for a slab page that was
+ * PFMEMALLOC but right now, we are losing the pfmemalloc
+ * information when the page leaves the per-cpu allocator
+ */
+ if (unlikely(!pfmemalloc_match(page, gfpflags))) {
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
+ goto new_slab;
+ }
+
+ /* must check again c->freelist in case of cpu migration or IRQ */
+ freelist = c->freelist;
+ if (freelist)
+ goto load_freelist;
+
+ freelist = get_freelist(s, page);
+
+ if (!freelist) {
+ c->page = NULL;
+ stat(s, DEACTIVATE_BYPASS);
+ goto new_slab;
+ }
+
+ stat(s, ALLOC_REFILL);
+
+load_freelist:
+ /*
+ * freelist is pointing to the list of objects to be used.
+ * page is pointing to the page from which the objects are obtained.
+ * That page must be frozen for per cpu allocations to work.
+ */
+ VM_BUG_ON(!c->page->frozen);
+ c->freelist = get_freepointer(s, freelist);
+ c->tid = next_tid(c->tid);
+ local_irq_restore(flags);
+ return freelist;
+
+new_slab:
+
+ /* Prefer a page from the per cpu partial list and re-run the checks. */
+ if (c->partial) {
+ page = c->page = c->partial;
+ c->partial = page->next;
+ stat(s, CPU_PARTIAL_ALLOC);
+ c->freelist = NULL;
+ goto redo;
+ }
+
+ freelist = new_slab_objects(s, gfpflags, node, &c);
+
+ if (unlikely(!freelist)) {
+ slab_out_of_memory(s, gfpflags, node);
+ local_irq_restore(flags);
+ return NULL;
+ }
+
+ page = c->page;
+ if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
+ goto load_freelist;
+
+ /* Only entered in the debug case */
+ if (kmem_cache_debug(s) &&
+ !alloc_debug_processing(s, page, freelist, addr))
+ goto new_slab; /* Slab failed checks. Next slab needed */
+
+ /*
+ * Hand out the first object only; the rest of the freelist goes back
+ * with the page and no cpu slab is kept.
+ */
+ deactivate_slab(s, page, get_freepointer(s, freelist));
+ c->page = NULL;
+ c->freelist = NULL;
+ local_irq_restore(flags);
+ return freelist;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ *
+ * Returns the object, or NULL if the pre-alloc hook rejected the request or
+ * the slow path failed. The object is zeroed when __GFP_ZERO is set.
+ */
+static __always_inline void *slab_alloc_node(struct kmem_cache *s,
+ gfp_t gfpflags, int node, unsigned long addr)
+{
+ void **object;
+ struct kmem_cache_cpu *c;
+ struct page *page;
+ unsigned long tid;
+
+ if (slab_pre_alloc_hook(s, gfpflags))
+ return NULL;
+
+ s = memcg_kmem_get_cache(s, gfpflags);
+redo:
+ /*
+ * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+ * enabled. We may switch back and forth between cpus while
+ * reading from one cpu area. That does not matter as long
+ * as we end up on the original cpu again when doing the cmpxchg.
+ *
+ * Preemption is disabled for the retrieval of the tid because that
+ * must occur from the current processor. We cannot allow rescheduling
+ * on a different processor between the determination of the pointer
+ * and the retrieval of the tid.
+ */
+ preempt_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ /*
+ * The transaction ids are globally unique per cpu and per operation on
+ * a per cpu queue. Thus they can guarantee that the cmpxchg_double
+ * occurs on the right processor and that there was no operation on the
+ * linked list in between.
+ */
+ tid = c->tid;
+ preempt_enable();
+
+ object = c->freelist;
+ page = c->page;
+ if (unlikely(!object || !node_match(page, node))) {
+ object = __slab_alloc(s, gfpflags, node, addr, c);
+ stat(s, ALLOC_SLOWPATH);
+ } else {
+ void *next_object = get_freepointer_safe(s, object);
+
+ /*
+ * The cmpxchg will only match if there was no additional
+ * operation and if we are on the right processor.
+ *
+ * The cmpxchg does the following atomically (without lock
+ * semantics!)
+ * 1. Relocate first pointer to the current per cpu area.
+ * 2. Verify that tid and freelist have not been changed
+ * 3. If they were not changed replace tid and freelist
+ *
+ * Since this is without lock semantics the protection is only
+ * against code executing on this cpu *not* from access by
+ * other cpus.
+ */
+ if (unlikely(!this_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ object, tid,
+ next_object, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_alloc", s, tid);
+ goto redo;
+ }
+ prefetch_freepointer(s, next_object);
+ stat(s, ALLOC_FASTPATH);
+ }
+
+ if (unlikely(gfpflags & __GFP_ZERO) && object)
+ memset(object, 0, s->object_size);
+
+ slab_post_alloc_hook(s, gfpflags, object);
+
+ return object;
+}
+
+/* Allocate from @s without a node preference (NUMA_NO_NODE). */
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+		gfp_t gfpflags, unsigned long addr)
+{
+	void *object = slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+
+	return object;
+}
+
+/*
+ * Allocate one object from cache @s and emit the kmem_cache_alloc trace
+ * event. Returns whatever slab_alloc() returned.
+ */
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+	void *object;
+
+	object = slab_alloc(s, gfpflags, _RET_IP_);
+	trace_kmem_cache_alloc(_RET_IP_, object, s->object_size,
+				s->size, gfpflags);
+
+	return object;
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_TRACING
+/*
+ * Like kmem_cache_alloc(), but reports the allocation through the kmalloc
+ * trace event with @size as the requested size.
+ */
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+	void *object;
+
+	object = slab_alloc(s, gfpflags, _RET_IP_);
+	trace_kmalloc(_RET_IP_, object, size, s->size, gfpflags);
+
+	return object;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * Allocate one object from cache @s with @node as the requested node and
+ * emit the kmem_cache_alloc_node trace event.
+ */
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+{
+	void *object;
+
+	object = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	trace_kmem_cache_alloc_node(_RET_IP_, object,
+				    s->object_size, s->size, gfpflags, node);
+
+	return object;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+#ifdef CONFIG_TRACING
+/*
+ * Node-aware allocation that is reported through the kmalloc_node trace
+ * event with @size as the requested size.
+ */
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+				    gfp_t gfpflags,
+				    int node, size_t size)
+{
+	void *object;
+
+	object = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+	trace_kmalloc_node(_RET_IP_, object,
+			   size, s->size, gfpflags, node);
+
+	return object;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
+#endif
+#endif
+
+/*
+ * Slow path handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
+ *
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
+ *
+ * @page: slab page that contains the object
+ * @x: object to free
+ * @addr: caller address, only handed to free_debug_processing()
+ */
+static void __slab_free(struct kmem_cache *s, struct page *page,
+ void *x, unsigned long addr)
+{
+ void *prior;
+ void **object = (void *)x;
+ int was_frozen;
+ struct page new;
+ unsigned long counters;
+ struct kmem_cache_node *n = NULL;
+ unsigned long uninitialized_var(flags);
+
+ stat(s, FREE_SLOWPATH);
+
+ if (kmem_cache_debug(s) &&
+ !(n = free_debug_processing(s, page, x, addr, &flags)))
+ return;
+
+ do {
+ /* Drop a list_lock held from debug processing or a failed attempt. */
+ if (unlikely(n)) {
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ n = NULL;
+ }
+ prior = page->freelist;
+ counters = page->counters;
+ set_freepointer(s, object, prior);
+ new.counters = counters;
+ was_frozen = new.frozen;
+ new.inuse--;
+ if ((!new.inuse || !prior) && !was_frozen) {
+
+ if (kmem_cache_has_cpu_partial(s) && !prior) {
+
+ /*
+ * Slab was on no list before and will be
+ * partially empty
+ * We can defer the list move and instead
+ * freeze it.
+ */
+ new.frozen = 1;
+
+ } else { /* Needs to be taken off a list */
+
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ }
+ }
+
+ } while (!cmpxchg_double_slab(s, page,
+ prior, counters,
+ object, new.counters,
+ "__slab_free"));
+
+ if (likely(!n)) {
+
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
+ if (new.frozen && !was_frozen) {
+ put_cpu_partial(s, page, 1);
+ stat(s, CPU_PARTIAL_FREE);
+ }
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ if (was_frozen)
+ stat(s, FREE_FROZEN);
+ return;
+ }
+
+ /* From here on n->list_lock is held. */
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
+ goto slab_empty;
+
+ /*
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
+ */
+ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
+ if (kmem_cache_debug(s))
+ remove_full(s, n, page);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return;
+
+slab_empty:
+ if (prior) {
+ /*
+ * Slab on the partial list.
+ */
+ remove_partial(n, page);
+ stat(s, FREE_REMOVE_PARTIAL);
+ } else {
+ /* Slab must be on the full list */
+ remove_full(s, n, page);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ stat(s, FREE_SLAB);
+ discard_slab(s, page);
+}
+
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This is typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ *
+ * @page: slab page that contains the object
+ * @x: object to free
+ * @addr: caller address, passed on to __slab_free()
+ */
+static __always_inline void slab_free(struct kmem_cache *s,
+ struct page *page, void *x, unsigned long addr)
+{
+ void **object = (void *)x;
+ struct kmem_cache_cpu *c;
+ unsigned long tid;
+
+ slab_free_hook(s, x);
+
+redo:
+ /*
+ * Determine the current cpu's per cpu slab.
+ * The cpu may change afterward. However that does not matter since
+ * data is retrieved via this pointer. If we are on the same cpu
+ * during the cmpxchg then the free will succeed.
+ */
+ preempt_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ tid = c->tid;
+ preempt_enable();
+
+ if (likely(page == c->page)) {
+ /* Link the object in front of the current per cpu freelist. */
+ set_freepointer(s, object, c->freelist);
+
+ if (unlikely(!this_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ c->freelist, tid,
+ object, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_free", s, tid);
+ goto redo;
+ }
+ stat(s, FREE_FASTPATH);
+ } else
+ __slab_free(s, page, x, addr);
+
+}
+
+/*
+ * Free object @x. The cache actually used is the one cache_from_obj()
+ * returns for (@s, @x); if that is NULL nothing is freed and no trace
+ * event is emitted.
+ */
+void kmem_cache_free(struct kmem_cache *s, void *x)
+{
+	struct kmem_cache *cache = cache_from_obj(s, x);
+
+	if (cache) {
+		slab_free(cache, virt_to_head_page(x), x, _RET_IP_);
+		trace_kmem_cache_free(_RET_IP_, x);
+	}
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+/*
+ * Object placement in a slab is made very easy because we always start at
+ * offset 0. If we tune the size of the object to the alignment then we can
+ * get the required alignment by putting one properly sized object after
+ * another.
+ *
+ * Notice that the allocation order determines the sizes of the per cpu
+ * caches. Each processor has always one slab available for allocations.
+ * Increasing the allocation order reduces the number of times that slabs
+ * must be moved on and off the partial lists and is therefore a factor in
+ * locking overhead.
+ */
+
+/*
+ * Minimum / Maximum order of slab pages. This influences locking overhead
+ * and slab fragmentation. A higher order reduces the number of partial slabs
+ * and increases the number of allocations possible without having to
+ * take the list_lock.
+ */
+static int slub_min_order;
+static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static int slub_min_objects;
+
+/*
+ * Merge control. If this is set then no merging of slab caches will occur.
+ * (Could be removed. This was introduced to pacify the merge skeptics.)
+ */
+static int slub_nomerge;
+
+/*
+ * Calculate the order of allocation given a slab object size.
+ *
+ * The order of allocation has significant impact on performance and other
+ * system components. Generally order 0 allocations should be preferred since
+ * order 0 does not cause fragmentation in the page allocator. Larger objects
+ * may be problematic to put into order 0 slabs because there may be too much
+ * unused space left. We go to a higher order if more than 1/16th of the slab
+ * would be wasted.
+ *
+ * In order to reach satisfactory performance we must ensure that a minimum
+ * number of objects is in one slab. Otherwise we may generate too much
+ * activity on the partial lists which requires taking the list_lock. This is
+ * less a concern for large slabs though which are rarely used.
+ *
+ * slub_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slub_max_order then
+ * we try to keep the page order as low as possible. So we accept more waste
+ * of space in favor of a small page order.
+ *
+ * Higher order allocations also allow the placement of more objects in a
+ * slab and thereby reduce object handling overhead. If the user has
+ * requested a higher minimum order then we start with that one instead of
+ * the smallest order which will fit the object.
+ *
+ * Returns the first order in [start, max_order] whose slab holds
+ * @min_objects objects plus @reserved bytes with at most
+ * slab_size / @fract_leftover bytes left over, or max_order + 1 when no
+ * such order exists.
+ */
+static inline int slab_order(int size, int min_objects,
+				int max_order, int fract_leftover, int reserved)
+{
+	int floor_order = slub_min_order;
+	int order;
+	int leftover;
+
+	if (order_objects(floor_order, size, reserved) > MAX_OBJS_PER_PAGE)
+		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
+
+	order = max(floor_order,
+		    fls(min_objects * size - 1) - PAGE_SHIFT);
+
+	while (order <= max_order) {
+		unsigned long slab_bytes = PAGE_SIZE << order;
+
+		/* Only orders that can hold min_objects are candidates. */
+		if (slab_bytes >= min_objects * size + reserved) {
+			leftover = (slab_bytes - reserved) % size;
+
+			if (leftover <= slab_bytes / fract_leftover)
+				break;
+		}
+
+		order++;
+	}
+
+	return order;
+}
+
+/*
+ * Pick the page order for slabs holding objects of @size bytes, with
+ * @reserved bytes set aside per slab.
+ *
+ * Returns the chosen order, or -ENOSYS if not even a single object can be
+ * placed below MAX_ORDER.
+ */
+static inline int calculate_order(int size, int reserved)
+{
+	int order;
+	int fraction;
+	int min_objects;
+	int max_objects;
+
+	/*
+	 * Start from the most demanding layout and relax it step by step:
+	 * for every object count first loosen the tolerated waste
+	 * (1/16, 1/8, 1/4 of the slab), then lower the object count.
+	 */
+	min_objects = slub_min_objects;
+	if (!min_objects)
+		min_objects = 4 * (fls(nr_cpu_ids) + 1);
+	max_objects = order_objects(slub_max_order, size, reserved);
+	min_objects = min(min_objects, max_objects);
+
+	for (; min_objects > 1; min_objects--) {
+		for (fraction = 16; fraction >= 4; fraction /= 2) {
+			order = slab_order(size, min_objects,
+					slub_max_order, fraction, reserved);
+			if (order <= slub_max_order)
+				return order;
+		}
+	}
+
+	/*
+	 * Multiple objects per slab did not work out. Try to place a
+	 * single object within slub_max_order.
+	 */
+	order = slab_order(size, 1, slub_max_order, 1, reserved);
+	if (order <= slub_max_order)
+		return order;
+
+	/*
+	 * Last resort: go beyond slub_max_order, up to MAX_ORDER.
+	 */
+	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
+
+	return order < MAX_ORDER ? order : -ENOSYS;
+}
+
+/*
+ * Bring a kmem_cache_node into its empty state: no partial slabs, an
+ * initialized list_lock and, with CONFIG_SLUB_DEBUG, zeroed slab/object
+ * counters and an empty full list.
+ */
+static void
+init_kmem_cache_node(struct kmem_cache_node *n)
+{
+	spin_lock_init(&n->list_lock);
+
+	INIT_LIST_HEAD(&n->partial);
+	n->nr_partial = 0;
+
+#ifdef CONFIG_SLUB_DEBUG
+	INIT_LIST_HEAD(&n->full);
+	atomic_long_set(&n->nr_slabs, 0);
+	atomic_long_set(&n->total_objects, 0);
+#endif
+}
+
+/*
+ * Allocate and initialize the per cpu structures of @s.
+ *
+ * Returns 1 on success and 0 if the per cpu allocation failed.
+ */
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+{
+	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
+			KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
+
+	/*
+	 * The double cmpxchg instructions need double word alignment;
+	 * see __pcpu_double_call_return_bool().
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+				     2 * sizeof(void *));
+
+	if (s->cpu_slab) {
+		init_kmem_cache_cpus(s);
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct kmem_cache *kmem_cache_node;
+
+/*
+ * No kmalloc_node yet so do it by hand. We know that this is the first
+ * slab on the node for this slabcache. There are no concurrent accesses
+ * possible.
+ *
+ * Note that this function only works on the kmem_cache_node
+ * when allocating for the kmem_cache_node. This is used for bootstrapping
+ * memory on a fresh node that has no slab structures yet.
+ *
+ * A failed slab allocation is fatal (BUG_ON); a slab that ends up on a
+ * different node is only reported.
+ */
+static void early_kmem_cache_node_alloc(int node)
+{
+ struct page *page;
+ struct kmem_cache_node *n;
+
+ BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
+
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+
+ BUG_ON(!page);
+ if (page_to_nid(page) != node) {
+ pr_err("SLUB: Unable to allocate memory from node %d\n", node);
+ pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
+ }
+
+ /* Carve the first object off the new slab's freelist by hand. */
+ n = page->freelist;
+ BUG_ON(!n);
+ page->freelist = get_freepointer(kmem_cache_node, n);
+ page->inuse = 1;
+ page->frozen = 0;
+ kmem_cache_node->node[node] = n;
+#ifdef CONFIG_SLUB_DEBUG
+ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
+ init_tracking(kmem_cache_node, n);
+#endif
+ init_kmem_cache_node(n);
+ inc_slabs_node(kmem_cache_node, node, page->objects);
+
+ /*
+ * No locks need to be taken here as it has just been
+ * initialized and there is no concurrent access.
+ */
+ __add_partial(n, page, DEACTIVATE_TO_HEAD);
+}
+
+/*
+ * Give every per node structure of @s back to the kmem_cache_node cache
+ * and clear the corresponding s->node[] slots.
+ */
+static void free_kmem_cache_nodes(struct kmem_cache *s)
+{
+	int nid;
+
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *victim = s->node[nid];
+
+		if (victim != NULL)
+			kmem_cache_free(kmem_cache_node, victim);
+
+		s->node[nid] = NULL;
+	}
+}
+
+/*
+ * Set up the per node structures of @s for all nodes with normal memory.
+ *
+ * While slab_state is DOWN the structures are built by
+ * early_kmem_cache_node_alloc(); afterwards they come from the
+ * kmem_cache_node cache.
+ *
+ * Returns 1 on success. On allocation failure everything allocated so far
+ * is released and 0 is returned.
+ */
+static int init_kmem_cache_nodes(struct kmem_cache *s)
+{
+	int nid;
+
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
+		struct kmem_cache_node *fresh;
+
+		if (slab_state == DOWN) {
+			early_kmem_cache_node_alloc(nid);
+		} else {
+			fresh = kmem_cache_alloc_node(kmem_cache_node,
+						      GFP_KERNEL, nid);
+			if (!fresh) {
+				free_kmem_cache_nodes(s);
+				return 0;
+			}
+
+			s->node[nid] = fresh;
+			init_kmem_cache_node(fresh);
+		}
+	}
+
+	return 1;
+}
+
+/* Store @min in s->min_partial, clamped to [MIN_PARTIAL, MAX_PARTIAL]. */
+static void set_min_partial(struct kmem_cache *s, unsigned long min)
+{
+	unsigned long clamped = min;
+
+	if (clamped < MIN_PARTIAL)
+		clamped = MIN_PARTIAL;
+	else if (clamped > MAX_PARTIAL)
+		clamped = MAX_PARTIAL;
+
+	s->min_partial = clamped;
+}
+
+/*
+ * calculate_sizes() determines the order and the distribution of data within
+ * a slab object.
+ *
+ * @forced_order: if >= 0 this order is used as is; otherwise the order
+ * comes from calculate_order()
+ *
+ * Updates s->flags (__OBJECT_POISON), s->inuse, s->offset, s->size,
+ * s->allocflags, s->oo, s->min and possibly s->max.
+ *
+ * Returns nonzero if the resulting slab holds at least one object, 0 if no
+ * order could be found or no object fits.
+ */
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
+{
+ unsigned long flags = s->flags;
+ unsigned long size = s->object_size;
+ int order;
+
+ /*
+ * Round up object size to the next word boundary. We can only
+ * place the free pointer at word boundaries and this determines
+ * the possible location of the free pointer.
+ */
+ size = ALIGN(size, sizeof(void *));
+
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Determine if we can poison the object itself. If the user of
+ * the slab may touch the object after free or before allocation
+ * then we should never poison the object itself.
+ */
+ if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
+ !s->ctor)
+ s->flags |= __OBJECT_POISON;
+ else
+ s->flags &= ~__OBJECT_POISON;
+
+
+ /*
+ * If we are Redzoning then check if there is some space between the
+ * end of the object and the free pointer. If not then add an
+ * additional word to have some bytes to store Redzone information.
+ */
+ if ((flags & SLAB_RED_ZONE) && size == s->object_size)
+ size += sizeof(void *);
+#endif
+
+ /*
+ * With that we have determined the number of bytes in actual use
+ * by the object. This is the potential offset to the free pointer.
+ */
+ s->inuse = size;
+
+ if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
+ s->ctor)) {
+ /*
+ * Relocate free pointer after the object if it is not
+ * permitted to overwrite the first word of the object on
+ * kmem_cache_free.
+ *
+ * This is the case if we do RCU, have a constructor or
+ * destructor or are poisoning the objects.
+ */
+ s->offset = size;
+ size += sizeof(void *);
+ }
+
+#ifdef CONFIG_SLUB_DEBUG
+ if (flags & SLAB_STORE_USER)
+ /*
+ * Need to store information about allocs and frees after
+ * the object.
+ */
+ size += 2 * sizeof(struct track);
+
+ if (flags & SLAB_RED_ZONE)
+ /*
+ * Add some empty padding so that we can catch
+ * overwrites from earlier objects rather than let
+ * tracking information or the free pointer be
+ * corrupted if a user writes before the start
+ * of the object.
+ */
+ size += sizeof(void *);
+#endif
+
+ /*
+ * SLUB stores one object immediately after another beginning from
+ * offset 0. In order to align the objects we have to simply size
+ * each object to conform to the alignment.
+ */
+ size = ALIGN(size, s->align);
+ s->size = size;
+ if (forced_order >= 0)
+ order = forced_order;
+ else
+ order = calculate_order(size, s->reserved);
+
+ /* calculate_order() reports failure with a negative value. */
+ if (order < 0)
+ return 0;
+
+ s->allocflags = 0;
+ if (order)
+ s->allocflags |= __GFP_COMP;
+
+ if (s->flags & SLAB_CACHE_DMA)
+ s->allocflags |= GFP_DMA;
+
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ s->allocflags |= __GFP_RECLAIMABLE;
+
+ /*
+ * Determine the number of objects per slab
+ */
+ s->oo = oo_make(order, size, s->reserved);
+ s->min = oo_make(get_order(size), size, s->reserved);
+ if (oo_objects(s->oo) > oo_objects(s->max))
+ s->max = s->oo;
+
+ return !!oo_objects(s->oo);
+}
+
+/*
+ * Finish the setup of cache @s: compute the final flags and object layout,
+ * choose the partial list limits and allocate the per node and per cpu
+ * structures.
+ *
+ * @flags: creation flags as requested by the caller; they are filtered
+ *	through kmem_cache_flags() before being stored in s->flags
+ *
+ * Returns 0 on success and -EINVAL on failure. With SLAB_PANIC set in
+ * @flags a failure panics instead of returning.
+ */
+static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
+{
+	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
+	s->reserved = 0;
+
+	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+		s->reserved = sizeof(struct rcu_head);
+
+	if (!calculate_sizes(s, -1))
+		goto error;
+	if (disable_higher_order_debug) {
+		/*
+		 * Disable debugging flags that store metadata if the min slab
+		 * order increased.
+		 */
+		if (get_order(s->size) > get_order(s->object_size)) {
+			s->flags &= ~DEBUG_METADATA_FLAGS;
+			s->offset = 0;
+			if (!calculate_sizes(s, -1))
+				goto error;
+		}
+	}
+
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+		/* Enable fast mode */
+		s->flags |= __CMPXCHG_DOUBLE;
+#endif
+
+	/*
+	 * The larger the object size is, the more pages we want on the partial
+	 * list to avoid pounding the page allocator excessively.
+	 */
+	set_min_partial(s, ilog2(s->size) / 2);
+
+	/*
+	 * cpu_partial determines the maximum number of objects kept in the
+	 * per cpu partial lists of a processor.
+	 *
+	 * Per cpu partial lists mainly contain slabs that just have one
+	 * object freed. If they are used for allocation then they can be
+	 * filled up again with minimal effort. The slab will never hit the
+	 * per node partial lists and therefore no locking will be required.
+	 *
+	 * This setting also determines
+	 *
+	 * A) The number of objects from per cpu partial slabs dumped to the
+	 *    per node list when we reach the limit.
+	 * B) The number of objects in cpu partial slabs to extract from the
+	 *    per node list when we run out of per cpu objects. We only fetch
+	 *    50% to keep some capacity around for frees.
+	 */
+	if (!kmem_cache_has_cpu_partial(s))
+		s->cpu_partial = 0;
+	else if (s->size >= PAGE_SIZE)
+		s->cpu_partial = 2;
+	else if (s->size >= 1024)
+		s->cpu_partial = 6;
+	else if (s->size >= 256)
+		s->cpu_partial = 13;
+	else
+		s->cpu_partial = 30;
+
+#ifdef CONFIG_NUMA
+	s->remote_node_defrag_ratio = 1000;
+#endif
+	if (!init_kmem_cache_nodes(s))
+		goto error;
+
+	if (alloc_kmem_cache_cpus(s))
+		return 0;
+
+	free_kmem_cache_nodes(s);
+error:
+	/*
+	 * "size" reports the object size the cache was created for and
+	 * "realsize" the per object footprint including metadata, so that
+	 * the two fields no longer print the same value.
+	 */
+	if (flags & SLAB_PANIC)
+		panic("Cannot create slab %s size=%lu realsize=%u "
+			"order=%u offset=%u flags=%lx\n",
+			s->name, (unsigned long)s->object_size, s->size,
+			oo_order(s->oo), s->offset, flags);
+	return -EINVAL;
+}
+
+/*
+ * Report @page through slab_err() and print every object of the page whose
+ * bit is not set in the map filled in by get_map().
+ *
+ * @text: format string handed to slab_err(); it receives s->name as its
+ * only argument
+ *
+ * Does nothing if the temporary bitmap cannot be allocated. Only compiled
+ * in with CONFIG_SLUB_DEBUG.
+ */
+static void list_slab_objects(struct kmem_cache *s, struct page *page,
+ const char *text)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ void *addr = page_address(page);
+ void *p;
+ /* One bit per object in the page. */
+ unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
+ sizeof(long), GFP_ATOMIC);
+ if (!map)
+ return;
+ slab_err(s, page, text, s->name);
+ slab_lock(page);
+
+ /* NOTE(review): get_map() presumably marks the free objects -- confirm. */
+ get_map(s, page, map);
+ for_each_object(p, s, addr, page->objects) {
+
+ if (!test_bit(slab_index(p, s, addr), map)) {
+ pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
+ print_tracking(s, p);
+ }
+ }
+ slab_unlock(page);
+ kfree(map);
+#endif
+}
+
+/*
+ * Attempt to free all partial slabs on a node.
+ * This is called from kmem_cache_close(). We must be the last thread
+ * using the cache and therefore we do not need to lock anymore.
+ *
+ * Slabs that still contain objects in use are left on the list and
+ * reported through list_slab_objects().
+ */
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+{
+	struct page *slab, *next;
+
+	list_for_each_entry_safe(slab, next, &n->partial, lru) {
+		if (slab->inuse) {
+			list_slab_objects(s, slab,
+			"Objects remaining in %s on kmem_cache_close()");
+			continue;
+		}
+
+		__remove_partial(n, slab);
+		discard_slab(s, slab);
+	}
+}
+
+/*
+ * Release all resources used by a slab cache.
+ *
+ * Returns 0 once the per cpu and per node structures have been freed.
+ * Returns 1, leaving those structures in place, as soon as a node still
+ * has partial slabs or any slabs left after free_partial().
+ */
+static inline int kmem_cache_close(struct kmem_cache *s)
+{
+ int node;
+
+ /* Push all per cpu slabs back first. */
+ flush_all(s);
+ /* Attempt to free all objects */
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ free_partial(s, n);
+ if (n->nr_partial || slabs_node(s, node))
+ return 1;
+ }
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+ return 0;
+}
+
+/*
+ * Shut down cache @s. Thin wrapper that returns the result of
+ * kmem_cache_close(): 0 on success, 1 if slabs remain.
+ */
+int __kmem_cache_shutdown(struct kmem_cache *s)
+{
+ return kmem_cache_close(s);
+}
+
+/********************************************************************
+ * Kmalloc subsystem
+ *******************************************************************/
+
+/*
+ * Handler for the "slub_min_order=" parameter: parse the integer in @str
+ * into slub_min_order. The result of get_option() is not checked.
+ * Always returns 1.
+ */
+static int __init setup_slub_min_order(char *str)
+{
+ get_option(&str, &slub_min_order);
+
+ return 1;
+}
+
+__setup("slub_min_order=", setup_slub_min_order);
+
+/*
+ * Handler for the "slub_max_order=" parameter: parse the integer in @str
+ * into slub_max_order and cap it at MAX_ORDER - 1. Always returns 1.
+ */
+static int __init setup_slub_max_order(char *str)
+{
+	char *arg = str;
+
+	get_option(&arg, &slub_max_order);
+	slub_max_order = min(slub_max_order, MAX_ORDER - 1);
+
+	return 1;
+}
+
+__setup("slub_max_order=", setup_slub_max_order);
+
+/*
+ * Handler for the "slub_min_objects=" parameter: parse the integer in @str
+ * into slub_min_objects. The result of get_option() is not checked.
+ * Always returns 1.
+ */
+static int __init setup_slub_min_objects(char *str)
+{
+ get_option(&str, &slub_min_objects);
+
+ return 1;
+}
+
+__setup("slub_min_objects=", setup_slub_min_objects);
+
+/*
+ * Handler for the "slub_nomerge" parameter: set slub_nomerge. @str is not
+ * used. Always returns 1.
+ */
+static int __init setup_slub_nomerge(char *str)
+{
+ slub_nomerge = 1;
+ return 1;
+}
+
+__setup("slub_nomerge", setup_slub_nomerge);
+
+/*
+ * Allocate @size bytes.
+ *
+ * Requests above KMALLOC_MAX_CACHE_SIZE go to kmalloc_large(). Otherwise
+ * the cache returned by kmalloc_slab() is used; if that lookup yields
+ * NULL or ZERO_SIZE_PTR the value is returned unchanged.
+ */
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	struct kmem_cache *cache;
+	void *object;
+
+	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+		return kmalloc_large(size, flags);
+
+	cache = kmalloc_slab(size, flags);
+	if (unlikely(ZERO_OR_NULL_PTR(cache)))
+		return cache;
+
+	object = slab_alloc(cache, flags, _RET_IP_);
+	trace_kmalloc(_RET_IP_, object, size, cache->size, flags);
+
+	return object;
+}
+EXPORT_SYMBOL(__kmalloc);
+
+#ifdef CONFIG_NUMA
+/*
+ * Allocate pages of order get_order(@size) on @node through
+ * alloc_kmem_pages_node(), with __GFP_COMP and __GFP_NOTRACK added to
+ * @flags. Returns the page address, or NULL if no page was obtained.
+ * kmalloc_large_node_hook() is invoked in both cases.
+ */
+static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+	struct page *pg;
+	void *addr;
+
+	flags |= __GFP_COMP | __GFP_NOTRACK;
+	pg = alloc_kmem_pages_node(node, flags, get_order(size));
+	addr = pg ? page_address(pg) : NULL;
+
+	kmalloc_large_node_hook(addr, size, flags);
+	return addr;
+}
+
+/*
+ * Node-aware variant of __kmalloc(): allocate @size bytes on @node.
+ *
+ * Requests larger than KMALLOC_MAX_CACHE_SIZE go to kmalloc_large_node() and
+ * are traced with the page-rounded size. Otherwise the cache is looked up
+ * with kmalloc_slab(); a ZERO_OR_NULL_PTR lookup result is returned
+ * unchanged, anything else is served by slab_alloc_node().
+ */
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, PAGE_SIZE << get_order(size),
+ flags, node);
+
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc_node(s, flags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif
+
+/*
+ * Report the size of the memory area behind @object.
+ *
+ * Returns 0 for ZERO_SIZE_PTR. If the head page is not a slab page, the
+ * size of the compound page is returned (with a warning if the page is not
+ * compound); otherwise the value comes from slab_ksize() of the owning
+ * cache.
+ */
+size_t ksize(const void *object)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page))) {
+ WARN_ON(!PageCompound(page));
+ return PAGE_SIZE << compound_order(page);
+ }
+
+ return slab_ksize(page->slab_cache);
+}
+EXPORT_SYMBOL(ksize);
+
+/*
+ * Free the memory referenced by @x.
+ *
+ * The kfree trace event is emitted first; ZERO_OR_NULL_PTR values are then
+ * ignored. A pointer whose head page is not a slab page must belong to a
+ * compound page (BUG otherwise) and is released through
+ * __free_kmem_pages(); everything else goes back to its cache via
+ * slab_free().
+ */
+void kfree(const void *x)
+{
+ struct page *page;
+ void *object = (void *)x;
+
+ trace_kfree(_RET_IP_, x);
+
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return;
+
+ page = virt_to_head_page(x);
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(x);
+ __free_kmem_pages(page, compound_order(page));
+ return;
+ }
+ slab_free(page->slab_cache, page, object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists and sorts
+ * the remaining slabs by the number of items in use. The slabs with the
+ * most items in use come first. New allocations will then fill those up
+ * and thus they can be removed from the partial lists.
+ *
+ * The slabs with the least items are placed last. This results in them
+ * being allocated from last increasing the chance that the last objects
+ * are freed in them.
+ *
+ * Returns 0 on success, or -ENOMEM if the temporary array of list heads
+ * (one per possible in-use count) cannot be allocated.
+ */
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+ int i;
+ struct kmem_cache_node *n;
+ struct page *page;
+ struct page *t;
+ int objects = oo_objects(s->max);
+ struct list_head *slabs_by_inuse =
+ kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ unsigned long flags;
+
+ if (!slabs_by_inuse)
+ return -ENOMEM;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ n = get_node(s, node);
+
+ if (!n->nr_partial)
+ continue;
+
+ for (i = 0; i < objects; i++)
+ INIT_LIST_HEAD(slabs_by_inuse + i);
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ /*
+ * Build lists indexed by the items in use in each slab.
+ *
+ * Note that concurrent frees may occur while we hold the
+ * list_lock. page->inuse here is the upper limit.
+ */
+ list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_move(&page->lru, slabs_by_inuse + page->inuse);
+ if (!page->inuse)
+ n->nr_partial--;
+ }
+
+ /*
+ * Rebuild the partial list with the slabs filled up most
+ * first and the least used slabs at the end.
+ *
+ * slabs_by_inuse[0] (the empty slabs) is not spliced back;
+ * those slabs are discarded below, after the lock is dropped.
+ */
+ for (i = objects - 1; i > 0; i--)
+ list_splice(slabs_by_inuse + i, n->partial.prev);
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Release empty slabs */
+ list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ discard_slab(s, page);
+ }
+
+ kfree(slabs_by_inuse);
+ return 0;
+}
+
+/*
+ * Shrink every cache on slab_caches while holding slab_mutex.
+ * The return value of __kmem_cache_shrink() is ignored and the callback
+ * always returns 0. @arg is not used.
+ */
+static int slab_mem_going_offline_callback(void *arg)
+{
+ struct kmem_cache *s;
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list)
+ __kmem_cache_shrink(s);
+ mutex_unlock(&slab_mutex);
+
+ return 0;
+}
+
+/*
+ * Drop the kmem_cache_node belonging to the node named in @arg
+ * (struct memory_notify, status_change_nid_normal) from every cache on
+ * slab_caches.
+ */
+static void slab_mem_offline_callback(void *arg)
+{
+	struct memory_notify *marg = arg;
+	int nid = marg->status_change_nid_normal;
+	struct kmem_cache *cache;
+
+	/*
+	 * A negative node id means the node still has available memory, so
+	 * its kmem_cache_node structures are still needed.
+	 */
+	if (nid < 0)
+		return;
+
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(cache, &slab_caches, list) {
+		struct kmem_cache_node *n = get_node(cache, nid);
+
+		if (!n)
+			continue;
+
+		/*
+		 * If slabs still exist on the node that is going down, we
+		 * were unable to free them, and offline_pages() should not
+		 * have called this callback. So, we must fail.
+		 */
+		BUG_ON(slabs_node(cache, nid));
+
+		cache->node[nid] = NULL;
+		kmem_cache_free(kmem_cache_node, n);
+	}
+	mutex_unlock(&slab_mutex);
+}
+
+/*
+ * Give every cache on slab_caches a kmem_cache_node for the node named in
+ * @arg (struct memory_notify, status_change_nid_normal).
+ *
+ * Returns 0 on success (or when the node id is negative) and -ENOMEM if an
+ * allocation fails; structures already installed for earlier caches in the
+ * list are left in place in that case.
+ */
+static int slab_mem_going_online_callback(void *arg)
+{
+ struct kmem_cache_node *n;
+ struct kmem_cache *s;
+ struct memory_notify *marg = arg;
+ int nid = marg->status_change_nid_normal;
+ int ret = 0;
+
+ /*
+ * If the node's memory is already available, then kmem_cache_node is
+ * already created. Nothing to do.
+ */
+ if (nid < 0)
+ return 0;
+
+ /*
+ * We are bringing a node online. No memory is available yet. We must
+ * allocate a kmem_cache_node structure in order to bring the node
+ * online.
+ */
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ /*
+ * XXX: kmem_cache_alloc_node will fallback to other nodes
+ * since memory is not yet available from the node that
+ * is brought up.
+ *
+ * NOTE(review): the call below is kmem_cache_alloc(), not
+ * kmem_cache_alloc_node(), so no node is requested at all.
+ */
+ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ init_kmem_cache_node(n);
+ s->node[nid] = n;
+ }
+out:
+ mutex_unlock(&slab_mutex);
+ return ret;
+}
+
+/*
+ * Memory hotplug notifier entry point: dispatch @action to the matching
+ * slab_mem_*_callback() and translate its result into a notifier return
+ * value (NOTIFY_OK, or notifier_from_errno() of a non-zero error).
+ */
+static int slab_memory_callback(struct notifier_block *self,
+				unsigned long action, void *arg)
+{
+	int err = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		err = slab_mem_going_online_callback(arg);
+		break;
+	case MEM_GOING_OFFLINE:
+		err = slab_mem_going_offline_callback(arg);
+		break;
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+		slab_mem_offline_callback(arg);
+		break;
+	case MEM_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		/* Nothing to do for these events. */
+		break;
+	}
+
+	return err ? notifier_from_errno(err) : NOTIFY_OK;
+}
+
+/* Notifier block that routes memory hotplug events to slab_memory_callback(). */
+static struct notifier_block slab_memory_callback_nb = {
+ .notifier_call = slab_memory_callback,
+ .priority = SLAB_CALLBACK_PRI,
+};
+
+/********************************************************************
+ * Basic setup of slabs
+ *******************************************************************/
+
+/*
+ * Used for early kmem_cache structures that were allocated using
+ * the page allocator. Allocate them properly then fix up the pointers
+ * that may be pointing to the wrong kmem_cache structure.
+ *
+ * Copies @static_cache into a new object taken from kmem_cache, repoints
+ * page->slab_cache of the slabs on the node lists to the copy, adds the
+ * copy to slab_caches and returns it.
+ */
+
+static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
+{
+ int node;
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ /* NOTE(review): s is used here without a NULL check. */
+ memcpy(s, static_cache, kmem_cache->object_size);
+
+ /*
+ * This runs very early, and only the boot processor is supposed to be
+ * up. Even if it weren't true, IRQs are not up so we couldn't fire
+ * IPIs around.
+ */
+ __flush_cpu_slab(s, smp_processor_id());
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+ struct page *p;
+
+ if (n) {
+ list_for_each_entry(p, &n->partial, lru)
+ p->slab_cache = s;
+
+#ifdef CONFIG_SLUB_DEBUG
+ list_for_each_entry(p, &n->full, lru)
+ p->slab_cache = s;
+#endif
+ }
+ }
+ list_add(&s->list, &slab_caches);
+ return s;
+}
+
+/*
+ * Boot-time initialisation of the allocator: set up kmem_cache_node and
+ * kmem_cache from static boot structures, replace both with properly
+ * allocated copies through bootstrap(), create the kmalloc caches, register
+ * the notifiers and print a summary line.
+ */
+void __init kmem_cache_init(void)
+{
+ static __initdata struct kmem_cache boot_kmem_cache,
+ boot_kmem_cache_node;
+
+ if (debug_guardpage_minorder())
+ slub_max_order = 0;
+
+ kmem_cache_node = &boot_kmem_cache_node;
+ kmem_cache = &boot_kmem_cache;
+
+ create_boot_cache(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
+
+ register_hotmemory_notifier(&slab_memory_callback_nb);
+
+ /* Able to allocate the per node structures */
+ slab_state = PARTIAL;
+
+ /* Object size covers the fixed part plus one node pointer per node id */
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN);
+
+ kmem_cache = bootstrap(&boot_kmem_cache);
+
+ /*
+ * Allocate kmem_cache_node properly from the kmem_cache slab.
+ * kmem_cache_node is separately allocated so no need to
+ * update any list pointers.
+ */
+ kmem_cache_node = bootstrap(&boot_kmem_cache_node);
+
+ /* Now we can use the kmem_cache to allocate kmalloc slabs */
+ create_kmalloc_caches(0);
+
+#ifdef CONFIG_SMP
+ register_cpu_notifier(&slab_notifier);
+#endif
+
+ pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
+ cache_line_size(),
+ slub_min_order, slub_max_order, slub_min_objects,
+ nr_cpu_ids, nr_node_ids);
+}
+
+/* Late initialisation hook; it has an empty body here. */
+void __init kmem_cache_init_late(void)
+{
+}
+
+/*
+ * Tell whether cache @s must be kept out of merging.
+ * Returns 1 if it cannot be shared with other caches, 0 otherwise.
+ */
+static int slab_unmergeable(struct kmem_cache *s)
+{
+	/* Merging switched off globally or by the cache's own flags. */
+	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
+		return 1;
+
+	/* Non-root caches and caches with a constructor are not merged. */
+	if (!is_root_cache(s) || s->ctor)
+		return 1;
+
+	/*
+	 * We may have set a slab to be unmergeable during bootstrap;
+	 * that shows up as a negative refcount.
+	 */
+	return s->refcount < 0;
+}
+
+/*
+ * Look for an existing cache that a new cache with the given parameters
+ * could share. Returns the first suitable cache on slab_caches, or NULL if
+ * merging is disabled, a constructor is supplied, or nothing matches.
+ */
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
+ return NULL;
+
+ if (ctor)
+ return NULL;
+
+ /* Normalise size, alignment and flags before comparing */
+ size = ALIGN(size, sizeof(void *));
+ align = calculate_alignment(flags, align, size);
+ size = ALIGN(size, align);
+ flags = kmem_cache_flags(size, flags, name, NULL);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (slab_unmergeable(s))
+ continue;
+
+ /* Candidate objects are too small */
+ if (size > s->size)
+ continue;
+
+ if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
+ continue;
+ /*
+ * Check if alignment is compatible.
+ * Courtesy of Adrian Drzewiecki
+ */
+ if ((s->size & ~(align - 1)) != s->size)
+ continue;
+
+ /* Skip if a pointer-sized chunk or more would go unused */
+ if (s->size - size >= sizeof(void *))
+ continue;
+
+ return s;
+ }
+ return NULL;
+}
+
+/*
+ * Try to satisfy a cache creation request by aliasing an existing cache.
+ *
+ * If find_mergeable() yields a cache, its refcount is raised and its
+ * object_size/inuse (and those of its memcg caches) are enlarged to cover
+ * @size. Returns that cache, or NULL when nothing is mergeable or the sysfs
+ * alias cannot be created (the refcount is dropped again in that case; the
+ * size adjustments are not undone).
+ */
+struct kmem_cache *
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ s = find_mergeable(size, align, flags, name, ctor);
+ if (s) {
+ int i;
+ struct kmem_cache *c;
+
+ s->refcount++;
+
+ /*
+ * Adjust the object sizes so that we clear
+ * the complete object on kzalloc.
+ */
+ s->object_size = max(s->object_size, (int)size);
+ s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
+ if (!c)
+ continue;
+ c->object_size = s->object_size;
+ c->inuse = max_t(int, c->inuse,
+ ALIGN(size, sizeof(void *)));
+ }
+
+ if (sysfs_slab_alias(s, name)) {
+ s->refcount--;
+ s = NULL;
+ }
+ }
+
+ return s;
+}
+
+/*
+ * Set up cache @s with @flags via kmem_cache_open().
+ *
+ * While slab_state <= UP only that step is performed. Afterwards the memcg
+ * slab attributes are propagated and the cache is added to sysfs; if the
+ * sysfs step fails the cache is closed again and the error is returned.
+ * Returns 0 on success or the error from the failing step.
+ */
+int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
+{
+ int err;
+
+ err = kmem_cache_open(s, flags);
+ if (err)
+ return err;
+
+ /* Mutex is not taken during early boot */
+ if (slab_state <= UP)
+ return 0;
+
+ memcg_propagate_slab_attrs(s);
+ err = sysfs_slab_add(s);
+ if (err)
+ kmem_cache_close(s);
+
+ return err;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to ensure that the cpu slabs are flushed when
+ * necessary.
+ *
+ * For CPU_UP_CANCELED and CPU_DEAD (and their _FROZEN variants) the cpu slab
+ * of @hcpu is flushed for every cache on slab_caches, with interrupts
+ * disabled around each flush. All other actions are ignored. Always returns
+ * NOTIFY_OK.
+ */
+static int slab_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct kmem_cache *s;
+ unsigned long flags;
+
+ switch (action) {
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ local_irq_save(flags);
+ __flush_cpu_slab(s, cpu);
+ local_irq_restore(flags);
+ }
+ mutex_unlock(&slab_mutex);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/* Notifier block that routes cpu events to slab_cpuup_callback(). */
+static struct notifier_block slab_notifier = {
+ .notifier_call = slab_cpuup_callback
+};
+
+#endif
+
+/*
+ * Same allocation path as __kmalloc(), except that the call site passed to
+ * slab_alloc() and to the trace event is the explicit @caller address
+ * instead of this function's own return address.
+ */
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large(size, gfpflags);
+
+ s = kmalloc_slab(size, gfpflags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc(s, gfpflags, caller);
+
+ /* Honor the call site pointer we received. */
+ trace_kmalloc(caller, ret, size, s->size, gfpflags);
+
+ return ret;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Same allocation path as __kmalloc_node(), except that the call site
+ * passed to slab_alloc_node() and to the trace events is the explicit
+ * @caller address.
+ */
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+ int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = kmalloc_large_node(size, gfpflags, node);
+
+ trace_kmalloc_node(caller, ret,
+ size, PAGE_SIZE << get_order(size),
+ gfpflags, node);
+
+ return ret;
+ }
+
+ s = kmalloc_slab(size, gfpflags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = slab_alloc_node(s, gfpflags, node, caller);
+
+ /* Honor the call site pointer we received. */
+ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_SYSFS
+/* Per-slab counter callback: returns page->inuse. */
+static int count_inuse(struct page *page)
+{
+ return page->inuse;
+}
+
+/* Per-slab counter callback: returns page->objects. */
+static int count_total(struct page *page)
+{
+ return page->objects;
+}
+#endif
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Check one slab page of cache @s. @map is scratch space for a bitmap of at
+ * least page->objects bits.
+ * Returns 1 if the slab passes, 0 as soon as a check fails.
+ */
+static int validate_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
+{
+ void *p;
+ void *addr = page_address(page);
+
+ if (!check_slab(s, page) ||
+ !on_freelist(s, page, NULL))
+ return 0;
+
+ /* Now we know that a valid freelist exists */
+ bitmap_zero(map, page->objects);
+
+ get_map(s, page, map);
+ /* Objects marked in the map are checked as inactive ... */
+ for_each_object(p, s, addr, page->objects) {
+ if (test_bit(slab_index(p, s, addr), map))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
+ return 0;
+ }
+
+ /* ... and all unmarked objects are checked as active. */
+ for_each_object(p, s, addr, page->objects)
+ if (!test_bit(slab_index(p, s, addr), map))
+ if (!check_object(s, page, p, SLUB_RED_ACTIVE))
+ return 0;
+ return 1;
+}
+
+/*
+ * Run validate_slab() on @page while holding the slab lock. The result of
+ * validate_slab() is discarded.
+ */
+static void validate_slab_slab(struct kmem_cache *s, struct page *page,
+ unsigned long *map)
+{
+ slab_lock(page);
+ validate_slab(s, page, map);
+ slab_unlock(page);
+}
+
+/*
+ * Validate the slabs of cache @s that sit on the lists of node @n.
+ *
+ * The partial list is always walked; the full list is walked only when
+ * SLAB_STORE_USER is set for the cache. After each walk the number of slabs
+ * seen is compared with the node's counter (nr_partial, then nr_slabs) and a
+ * mismatch is reported with pr_err(). The whole walk runs under
+ * n->list_lock with interrupts disabled.
+ *
+ * @map: scratch bitmap handed down to validate_slab().
+ *
+ * Returns the number of slabs visited.
+ */
+static int validate_slab_node(struct kmem_cache *s,
+		struct kmem_cache_node *n, unsigned long *map)
+{
+	unsigned long count = 0;
+	struct page *page;
+	unsigned long flags;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+
+	list_for_each_entry(page, &n->partial, lru) {
+		validate_slab_slab(s, page, map);
+		count++;
+	}
+	/* count and nr_partial are unsigned long, hence %lu */
+	if (count != n->nr_partial)
+		pr_err("SLUB %s: %lu partial slabs counted but counter=%lu\n",
+		       s->name, count, n->nr_partial);
+
+	if (!(s->flags & SLAB_STORE_USER))
+		goto out;
+
+	list_for_each_entry(page, &n->full, lru) {
+		validate_slab_slab(s, page, map);
+		count++;
+	}
+	/* atomic_long_read() yields a long, so that value keeps %ld */
+	if (count != atomic_long_read(&n->nr_slabs))
+		pr_err("SLUB: %s %lu slabs counted but counter=%ld\n",
+		       s->name, count, atomic_long_read(&n->nr_slabs));
+
+out:
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return count;
+}
+
+/*
+ * Validate the slabs of @s on every N_NORMAL_MEMORY node after flushing the
+ * per-cpu slabs. Returns the summed per-node results of
+ * validate_slab_node(), or -ENOMEM if the object bitmap cannot be allocated.
+ */
+static long validate_slab_cache(struct kmem_cache *s)
+{
+ int node;
+ unsigned long count = 0;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+ sizeof(unsigned long), GFP_KERNEL);
+
+ if (!map)
+ return -ENOMEM;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+
+ count += validate_slab_node(s, n, map);
+ }
+ kfree(map);
+ return count;
+}
+/*
+ * Generate lists of code addresses where slabcache objects are allocated
+ * and freed.
+ *
+ * struct location is one entry of such a list: the statistics gathered by
+ * add_location() for a single code address.
+ */
+
+struct location {
+ unsigned long count; /* tracking records seen for this address */
+ unsigned long addr; /* code address (track->addr) */
+ long long sum_time; /* sum of ages (jiffies - track->when) */
+ long min_time; /* smallest age seen */
+ long max_time; /* largest age seen */
+ long min_pid; /* lowest pid seen */
+ long max_pid; /* highest pid seen */
+ DECLARE_BITMAP(cpus, NR_CPUS); /* cpus recorded in the tracks */
+ nodemask_t nodes; /* nodes of the pages holding the tracks */
+};
+
+/* Growable array of struct location, kept ordered by address. */
+struct loc_track {
+ unsigned long max; /* entries the array has room for */
+ unsigned long count; /* entries currently in use */
+ struct location *loc; /* the array itself (page allocation) */
+};
+
+/*
+ * Give the pages backing @t->loc back to the page allocator.
+ * Nothing is done when t->max is zero.
+ */
+static void free_loc_track(struct loc_track *t)
+{
+	unsigned long bytes;
+
+	if (!t->max)
+		return;
+
+	bytes = sizeof(struct location) * t->max;
+	free_pages((unsigned long)t->loc, get_order(bytes));
+}
+
+/*
+ * (Re)allocate the location array of @t with room for @max entries.
+ * Existing entries are copied into the new array and the old pages are
+ * freed. Returns 1 on success, 0 if the pages could not be obtained (@t is
+ * left untouched in that case).
+ */
+static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
+{
+ struct location *l;
+ int order;
+
+ order = get_order(sizeof(struct location) * max);
+
+ l = (void *)__get_free_pages(flags, order);
+ if (!l)
+ return 0;
+
+ if (t->count) {
+ memcpy(l, t->loc, sizeof(struct location) * t->count);
+ free_loc_track(t);
+ }
+ t->max = max;
+ t->loc = l;
+ return 1;
+}
+
+/*
+ * Account @track in @t. The array is searched by bisection on the code
+ * address; a hit updates the statistics of the existing entry, a miss
+ * inserts a new entry at the position that keeps the array ordered.
+ * Returns 1 on success, 0 if the array had to grow and that failed.
+ * @s is not used.
+ */
+static int add_location(struct loc_track *t, struct kmem_cache *s,
+ const struct track *track)
+{
+ long start, end, pos;
+ struct location *l;
+ unsigned long caddr;
+ unsigned long age = jiffies - track->when;
+
+ start = -1;
+ end = t->count;
+
+ for ( ; ; ) {
+ pos = start + (end - start + 1) / 2;
+
+ /*
+ * There is nothing at "end". If we end up there
+ * we need to add something to before end.
+ */
+ if (pos == end)
+ break;
+
+ caddr = t->loc[pos].addr;
+ if (track->addr == caddr) {
+
+ l = &t->loc[pos];
+ l->count++;
+ /* Time, pid and cpu data only count when a timestamp exists */
+ if (track->when) {
+ l->sum_time += age;
+ if (age < l->min_time)
+ l->min_time = age;
+ if (age > l->max_time)
+ l->max_time = age;
+
+ if (track->pid < l->min_pid)
+ l->min_pid = track->pid;
+ if (track->pid > l->max_pid)
+ l->max_pid = track->pid;
+
+ cpumask_set_cpu(track->cpu,
+ to_cpumask(l->cpus));
+ }
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
+ return 1;
+ }
+
+ if (track->addr < caddr)
+ end = pos;
+ else
+ start = pos;
+ }
+
+ /*
+ * Not found. Insert new tracking element.
+ */
+ if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
+ return 0;
+
+ l = t->loc + pos;
+ /* Shift the tail up by one slot to make room at pos */
+ if (pos < t->count)
+ memmove(l + 1, l,
+ (t->count - pos) * sizeof(struct location));
+ t->count++;
+ l->count = 1;
+ l->addr = track->addr;
+ l->sum_time = age;
+ l->min_time = age;
+ l->max_time = age;
+ l->min_pid = track->pid;
+ l->max_pid = track->pid;
+ cpumask_clear(to_cpumask(l->cpus));
+ cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
+ nodes_clear(l->nodes);
+ node_set(page_to_nid(virt_to_page(track)), l->nodes);
+ return 1;
+}
+
+/*
+ * Feed the tracking records of slab @page into @t: every object whose bit is
+ * clear in the map filled by get_map() has its @alloc track passed to
+ * add_location(). The return value of add_location() is ignored.
+ */
+static void process_slab(struct loc_track *t, struct kmem_cache *s,
+ struct page *page, enum track_item alloc,
+ unsigned long *map)
+{
+ void *addr = page_address(page);
+ void *p;
+
+ bitmap_zero(map, page->objects);
+ get_map(s, page, map);
+
+ for_each_object(p, s, addr, page->objects)
+ if (!test_bit(slab_index(p, s, addr), map))
+ add_location(t, s, get_track(s, p, alloc));
+}
+
+/*
+ * Write a summary of the call sites recorded for the objects of @s into
+ * @buf, one line per distinct code address. @alloc selects which track
+ * record (enum track_item) is read from each object.
+ *
+ * Output stops early once @buf gets close to PAGE_SIZE. Returns the number
+ * of bytes written; allocation failure and an empty result are reported as
+ * text ("Out of memory", "No data") rather than as an error code.
+ */
+static int list_locations(struct kmem_cache *s, char *buf,
+ enum track_item alloc)
+{
+ int len = 0;
+ unsigned long i;
+ struct loc_track t = { 0, 0, NULL };
+ int node;
+ unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+ sizeof(unsigned long), GFP_KERNEL);
+
+ if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_TEMPORARY)) {
+ kfree(map);
+ return sprintf(buf, "Out of memory\n");
+ }
+ /* Push back cpu slabs */
+ flush_all(s);
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+ unsigned long flags;
+ struct page *page;
+
+ if (!atomic_long_read(&n->nr_slabs))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, lru)
+ process_slab(&t, s, page, alloc, map);
+ list_for_each_entry(page, &n->full, lru)
+ process_slab(&t, s, page, alloc, map);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ for (i = 0; i < t.count; i++) {
+ struct location *l = &t.loc[i];
+
+ /* Keep headroom below PAGE_SIZE for one more line */
+ if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
+ break;
+ len += sprintf(buf + len, "%7ld ", l->count);
+
+ if (l->addr)
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
+ else
+ len += sprintf(buf + len, "<not-available>");
+
+ /* min/avg/max when they can differ, a single age otherwise */
+ if (l->sum_time != l->min_time) {
+ len += sprintf(buf + len, " age=%ld/%ld/%ld",
+ l->min_time,
+ (long)div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ len += sprintf(buf + len, " age=%ld",
+ l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ len += sprintf(buf + len, " pid=%ld-%ld",
+ l->min_pid, l->max_pid);
+ else
+ len += sprintf(buf + len, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 &&
+ !cpumask_empty(to_cpumask(l->cpus)) &&
+ len < PAGE_SIZE - 60) {
+ len += sprintf(buf + len, " cpus=");
+ len += cpulist_scnprintf(buf + len,
+ PAGE_SIZE - len - 50,
+ to_cpumask(l->cpus));
+ }
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
+ len < PAGE_SIZE - 60) {
+ len += sprintf(buf + len, " nodes=");
+ len += nodelist_scnprintf(buf + len,
+ PAGE_SIZE - len - 50,
+ l->nodes);
+ }
+
+ len += sprintf(buf + len, "\n");
+ }
+
+ free_loc_track(&t);
+ kfree(map);
+ /* len is still 0 here when nothing was recorded */
+ if (!t.count)
+ len += sprintf(buf, "No data\n");
+ return len;
+}
+#endif
+
+#ifdef SLUB_RESILIENCY_TEST
+/*
+ * Self test, compiled only when SLUB_RESILIENCY_TEST is defined.
+ *
+ * Deliberately corrupts kmalloc objects (writes past the end of an object in
+ * part A, writes after kfree() in part B) and runs validate_slab_cache() on
+ * the affected kmalloc cache after each step.
+ *
+ * NOTE(review): the kzalloc() results are used without NULL checks, and the
+ * objects allocated in part A are never freed.
+ */
+static void resiliency_test(void)
+{
+ u8 *p;
+
+ /* The kmalloc_caches[4..9] indices below rely on these bounds */
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
+
+ pr_err("SLUB resiliency testing\n");
+ pr_err("-----------------------\n");
+ pr_err("A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12;
+ pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
+ p + 16);
+
+ validate_slab_cache(kmalloc_caches[4]);
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34;
+ pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches[5]);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *);
+ *p = 0x56;
+ pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches[6]);
+
+ pr_err("\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78;
+ pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[7]);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a;
+ pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[8]);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab;
+ pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[9]);
+}
+#else
+#ifdef CONFIG_SYSFS
+static void resiliency_test(void) {}; /* empty stub when SLUB_RESILIENCY_TEST is not defined */
+#endif
+#endif
+
+#ifdef CONFIG_SYSFS
+/*
+ * Selectors understood by show_slab_objects(). Each enumerator is a bit
+ * position; the SO_* macros below turn it into the matching bit mask.
+ */
+enum slab_stat_type {
+ SL_ALL, /* All slabs */
+ SL_PARTIAL, /* Only partially allocated slabs */
+ SL_CPU, /* Only slabs used for cpu caches */
+ SL_OBJECTS, /* Determine allocated objects not slabs */
+ SL_TOTAL /* Determine object capacity not slabs */
+};
+
+#define SO_ALL (1 << SL_ALL)
+#define SO_PARTIAL (1 << SL_PARTIAL)
+#define SO_CPU (1 << SL_CPU)
+#define SO_OBJECTS (1 << SL_OBJECTS)
+#define SO_TOTAL (1 << SL_TOTAL)
+
+/*
+ * Format slab or object counts of cache @s into @buf.
+ *
+ * @flags is a combination of SO_* bits:
+ *   SO_CPU      count the per-cpu slabs (c->page and c->partial),
+ *   SO_ALL      count all slabs per node (only with CONFIG_SLUB_DEBUG),
+ *   SO_PARTIAL  count the slabs on the node partial lists (used when
+ *               SO_ALL is not handled),
+ *   SO_OBJECTS  report objects in use instead of slabs,
+ *   SO_TOTAL    report object capacity instead of slabs.
+ *
+ * The output is the grand total, followed on CONFIG_NUMA by " N<node>=<n>"
+ * for every node with a non-zero count, and a newline.
+ *
+ * Returns the number of bytes written, or -ENOMEM if the per-node scratch
+ * array cannot be allocated.
+ */
+static ssize_t show_slab_objects(struct kmem_cache *s,
+			    char *buf, unsigned long flags)
+{
+	unsigned long total = 0;
+	int node;
+	int x;
+	unsigned long *nodes;
+
+	nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
+	if (!nodes)
+		return -ENOMEM;
+
+	if (flags & SO_CPU) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
+							       cpu);
+			int node;
+			struct page *page;
+
+			page = ACCESS_ONCE(c->page);
+			if (!page)
+				continue;
+
+			node = page_to_nid(page);
+			if (flags & SO_TOTAL)
+				x = page->objects;
+			else if (flags & SO_OBJECTS)
+				x = page->inuse;
+			else
+				x = 1;
+
+			total += x;
+			nodes[node] += x;
+
+			page = ACCESS_ONCE(c->partial);
+			if (page) {
+				node = page_to_nid(page);
+				/*
+				 * Object counts are not available for the
+				 * per-cpu partial list. Warn and contribute
+				 * nothing, rather than adding the value
+				 * computed for c->page a second time.
+				 */
+				if (flags & SO_TOTAL) {
+					WARN_ON_ONCE(1);
+					x = 0;
+				} else if (flags & SO_OBJECTS) {
+					WARN_ON_ONCE(1);
+					x = 0;
+				} else {
+					x = page->pages;
+				}
+				total += x;
+				nodes[node] += x;
+			}
+		}
+	}
+
+	get_online_mems();
+#ifdef CONFIG_SLUB_DEBUG
+	if (flags & SO_ALL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = atomic_long_read(&n->total_objects);
+			else if (flags & SO_OBJECTS)
+				x = atomic_long_read(&n->total_objects) -
+					count_partial(n, count_free);
+			else
+				x = atomic_long_read(&n->nr_slabs);
+			total += x;
+			nodes[node] += x;
+		}
+
+	} else
+#endif
+	if (flags & SO_PARTIAL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = count_partial(n, count_total);
+			else if (flags & SO_OBJECTS)
+				x = count_partial(n, count_inuse);
+			else
+				x = n->nr_partial;
+			total += x;
+			nodes[node] += x;
+		}
+	}
+	x = sprintf(buf, "%lu", total);
+#ifdef CONFIG_NUMA
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		if (nodes[node])
+			x += sprintf(buf + x, " N%d=%lu",
+					node, nodes[node]);
+#endif
+	put_online_mems();
+	kfree(nodes);
+	return x + sprintf(buf + x, "\n");
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/*
+ * Return 1 if any online node that has a kmem_cache_node for @s reports a
+ * non-zero total_objects count, 0 otherwise.
+ */
+static int any_slab_objects(struct kmem_cache *s)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		struct kmem_cache_node *n = get_node(s, nid);
+
+		if (n && atomic_long_read(&n->total_objects))
+			return 1;
+	}
+	return 0;
+}
+#endif
+
+/* Map an embedded attribute / kobject back to its containing structure. */
+#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
+#define to_slab(n) container_of(n, struct kmem_cache, kobj)
+
+/* A per-cache attribute with typed show/store callbacks. */
+struct slab_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kmem_cache *s, char *buf);
+ ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
+};
+
+/* Define <name>_attr as read-only (mode 0400), backed by <name>_show(). */
+#define SLAB_ATTR_RO(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0400, _name##_show, NULL)
+
+/*
+ * Define <name>_attr as read-write (mode 0600), backed by <name>_show() and
+ * <name>_store().
+ */
+#define SLAB_ATTR(_name) \
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0600, _name##_show, _name##_store)
+
+/* Read-only attribute: prints s->size. */
+static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->size);
+}
+SLAB_ATTR_RO(slab_size);
+
+/* Read-only attribute: prints s->align. */
+static ssize_t align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->align);
+}
+SLAB_ATTR_RO(align);
+
+/* Read-only attribute: prints s->object_size. */
+static ssize_t object_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->object_size);
+}
+SLAB_ATTR_RO(object_size);
+
+/* Read-only attribute: prints the object count encoded in s->oo. */
+static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", oo_objects(s->oo));
+}
+SLAB_ATTR_RO(objs_per_slab);
+
+/*
+ * Set a new order for @s. The input is parsed as a decimal number; values
+ * outside [slub_min_order, slub_max_order] are rejected with -EINVAL,
+ * otherwise the sizes are recalculated with that order.
+ */
+static ssize_t order_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long order;
+ int err;
+
+ err = kstrtoul(buf, 10, &order);
+ if (err)
+ return err;
+
+ if (order > slub_max_order || order < slub_min_order)
+ return -EINVAL;
+
+ calculate_sizes(s, order);
+ return length;
+}
+
+/* Prints the order encoded in s->oo. */
+static ssize_t order_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", oo_order(s->oo));
+}
+SLAB_ATTR(order);
+
+/* Prints s->min_partial. */
+static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%lu\n", s->min_partial);
+}
+
+/* Parses a decimal number and hands it to set_min_partial(). */
+static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 10, &min);
+ if (err)
+ return err;
+
+ set_min_partial(s, min);
+ return length;
+}
+SLAB_ATTR(min_partial);
+
+/* Prints s->cpu_partial. */
+static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->cpu_partial);
+}
+
+/*
+ * Parses a decimal number into s->cpu_partial and then flushes the per-cpu
+ * slabs. A non-zero value is rejected with -EINVAL when
+ * kmem_cache_has_cpu_partial() is false for the cache.
+ */
+static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long objects;
+ int err;
+
+ err = kstrtoul(buf, 10, &objects);
+ if (err)
+ return err;
+ if (objects && !kmem_cache_has_cpu_partial(s))
+ return -EINVAL;
+
+ s->cpu_partial = objects;
+ flush_all(s);
+ return length;
+}
+SLAB_ATTR(cpu_partial);
+
+/* Prints the constructor symbol, or nothing when the cache has none. */
+static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+{
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
+}
+SLAB_ATTR_RO(ctor);
+
+/* Prints s->refcount - 1. */
+static ssize_t aliases_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->refcount - 1);
+}
+SLAB_ATTR_RO(aliases);
+
+/* show_slab_objects() with SO_PARTIAL. */
+static ssize_t partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL);
+}
+SLAB_ATTR_RO(partial);
+
+/* show_slab_objects() with SO_CPU. */
+static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_CPU);
+}
+SLAB_ATTR_RO(cpu_slabs);
+
+/* show_slab_objects() with SO_ALL|SO_OBJECTS. */
+static ssize_t objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects);
+
+/* show_slab_objects() with SO_PARTIAL|SO_OBJECTS. */
+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects_partial);
+
+/*
+ * Prints "<objects>(<pages>)" summed over the per-cpu partial lists of all
+ * online cpus, followed on CONFIG_SMP by " C<cpu>=<objects>(<pages>)" for
+ * each cpu that has a partial list, while room remains in the buffer.
+ */
+static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ int objects = 0;
+ int pages = 0;
+ int cpu;
+ int len;
+
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+
+ if (page) {
+ pages += page->pages;
+ objects += page->pobjects;
+ }
+ }
+
+ len = sprintf(buf, "%d(%d)", objects, pages);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
+
+ if (page && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%d(%d)", cpu,
+ page->pobjects, page->pages);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+SLAB_ATTR_RO(slabs_cpu_partial);
+
+/* Prints 1 if SLAB_RECLAIM_ACCOUNT is set in s->flags, else 0. */
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+}
+
+/*
+ * Sets SLAB_RECLAIM_ACCOUNT when the input starts with '1'; any other input
+ * leaves the flag cleared.
+ */
+static ssize_t reclaim_account_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ s->flags &= ~SLAB_RECLAIM_ACCOUNT;
+ if (buf[0] == '1')
+ s->flags |= SLAB_RECLAIM_ACCOUNT;
+ return length;
+}
+SLAB_ATTR(reclaim_account);
+
+/* Prints 1 if SLAB_HWCACHE_ALIGN is set in s->flags, else 0. */
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+}
+SLAB_ATTR_RO(hwcache_align);
+
+#ifdef CONFIG_ZONE_DMA
+/* Prints 1 if SLAB_CACHE_DMA is set in s->flags, else 0. */
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+}
+SLAB_ATTR_RO(cache_dma);
+#endif
+
+/* Prints 1 if SLAB_DESTROY_BY_RCU is set in s->flags, else 0. */
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
+}
+SLAB_ATTR_RO(destroy_by_rcu);
+
+/* Prints s->reserved. */
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
+#ifdef CONFIG_SLUB_DEBUG
+/* show_slab_objects() with SO_ALL. */
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL);
+}
+SLAB_ATTR_RO(slabs);
+
+/* show_slab_objects() with SO_ALL|SO_TOTAL. */
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
+}
+SLAB_ATTR_RO(total_objects);
+
+/* Prints 1 if SLAB_DEBUG_FREE is set in s->flags, else 0. */
+static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+}
+
+/*
+ * Sets SLAB_DEBUG_FREE when the input starts with '1' (also clearing
+ * __CMPXCHG_DOUBLE); any other input leaves the flag cleared.
+ */
+static ssize_t sanity_checks_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ s->flags &= ~SLAB_DEBUG_FREE;
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
+ s->flags |= SLAB_DEBUG_FREE;
+ }
+ return length;
+}
+SLAB_ATTR(sanity_checks);
+
+/* Prints 1 if SLAB_TRACE is set in s->flags, else 0. */
+static ssize_t trace_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+}
+
+/*
+ * Sets SLAB_TRACE when the input starts with '1' (also clearing
+ * __CMPXCHG_DOUBLE); any other input leaves the flag cleared.
+ */
+static ssize_t trace_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_TRACE;
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
+ s->flags |= SLAB_TRACE;
+ }
+ return length;
+}
+SLAB_ATTR(trace);
+
+/* Prints 1 if SLAB_RED_ZONE is set in s->flags, else 0. */
+static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
+}
+
+/*
+ * Sets SLAB_RED_ZONE when the input starts with '1' (also clearing
+ * __CMPXCHG_DOUBLE), clears it otherwise, then recalculates the cache
+ * sizes. Refused with -EBUSY while any_slab_objects() reports objects.
+ */
+static ssize_t red_zone_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ if (any_slab_objects(s))
+ return -EBUSY;
+
+ s->flags &= ~SLAB_RED_ZONE;
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
+ s->flags |= SLAB_RED_ZONE;
+ }
+ calculate_sizes(s, -1);
+ return length;
+}
+SLAB_ATTR(red_zone);
+
+/* Prints 1 if SLAB_POISON is set in s->flags, else 0. */
+static ssize_t poison_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
+}
+
+/*
+ * Sets SLAB_POISON when the input starts with '1' (also clearing
+ * __CMPXCHG_DOUBLE), clears it otherwise, then recalculates the cache
+ * sizes. Refused with -EBUSY while any_slab_objects() reports objects.
+ */
+static ssize_t poison_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ if (any_slab_objects(s))
+ return -EBUSY;
+
+ s->flags &= ~SLAB_POISON;
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
+ s->flags |= SLAB_POISON;
+ }
+ calculate_sizes(s, -1);
+ return length;
+}
+SLAB_ATTR(poison);
+
+/* sysfs "store_user": prints 1 if SLAB_STORE_USER is set, else 0. */
+static ssize_t store_user_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
+}
+
+/*
+ * Toggle SLAB_STORE_USER ('1' sets it, anything else clears it) and then
+ * recompute the cache sizes. Refused with -EBUSY while any_slab_objects()
+ * reports objects in the cache.
+ */
+static ssize_t store_user_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ if (any_slab_objects(s))
+ return -EBUSY;
+
+ s->flags &= ~SLAB_STORE_USER;
+ if (buf[0] == '1') {
+ s->flags &= ~__CMPXCHG_DOUBLE;
+ s->flags |= SLAB_STORE_USER;
+ }
+ calculate_sizes(s, -1);
+ return length;
+}
+SLAB_ATTR(store_user);
+
+/* sysfs "validate" has nothing to report: reading yields an empty result. */
+static ssize_t validate_show(struct kmem_cache *s, char *buf)
+{
+	return 0;
+}
+
+/*
+ * Writing a string starting with '1' runs validate_slab_cache() on the
+ * cache. Returns @length when validation returns a non-negative value,
+ * the negative validation result otherwise, and -EINVAL for other input.
+ */
+static ssize_t validate_store(struct kmem_cache *s,
+			const char *buf, size_t length)
+{
+	int result;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	result = validate_slab_cache(s);
+	if (result >= 0)
+		result = length;
+
+	return result;
+}
+SLAB_ATTR(validate);
+
+/*
+ * sysfs "alloc_calls" (read-only): lists TRACK_ALLOC locations via
+ * list_locations(). Returns -ENOSYS unless SLAB_STORE_USER is set.
+ */
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_ALLOC);
+}
+SLAB_ATTR_RO(alloc_calls);
+
+/*
+ * sysfs "free_calls" (read-only): lists TRACK_FREE locations via
+ * list_locations(). Returns -ENOSYS unless SLAB_STORE_USER is set.
+ */
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_FREE);
+}
+SLAB_ATTR_RO(free_calls);
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef CONFIG_FAILSLAB
+/* sysfs "failslab": prints 1 if SLAB_FAILSLAB is set on the cache, else 0. */
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+
+/*
+ * Input starting with '1' sets SLAB_FAILSLAB; any other input clears it.
+ * Always returns @length.
+ */
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_FAILSLAB;
+ if (buf[0] == '1')
+ s->flags |= SLAB_FAILSLAB;
+ return length;
+}
+SLAB_ATTR(failslab);
+#endif
+
+/* sysfs "shrink" has nothing to report: reading yields an empty result. */
+static ssize_t shrink_show(struct kmem_cache *s, char *buf)
+{
+	return 0;
+}
+
+/*
+ * Writing a string starting with '1' calls kmem_cache_shrink() on the
+ * cache. Returns @length on success, the shrink error code if it is
+ * non-zero, and -EINVAL for any other input.
+ */
+static ssize_t shrink_store(struct kmem_cache *s,
+			const char *buf, size_t length)
+{
+	int status;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	status = kmem_cache_shrink(s);
+	if (status)
+		return status;
+
+	return length;
+}
+SLAB_ATTR(shrink);
+
+#ifdef CONFIG_NUMA
+/*
+ * sysfs "remote_node_defrag_ratio": the value is kept scaled by 10 in the
+ * cache, so divide it back before printing.
+ */
+static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
+}
+
+/*
+ * Parse a decimal ratio in the range 0..100 and store it scaled by 10.
+ *
+ * Returns @length on success, the kstrtoul() error for unparsable input,
+ * or -ERANGE for a value above 100. An out-of-range value used to be
+ * dropped silently while the write still reported success, leaving the
+ * writer unaware that nothing had changed.
+ */
+static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
+				const char *buf, size_t length)
+{
+	unsigned long ratio;
+	int err;
+
+	err = kstrtoul(buf, 10, &ratio);
+	if (err)
+		return err;
+
+	if (ratio > 100)
+		return -ERANGE;
+
+	s->remote_node_defrag_ratio = ratio * 10;
+
+	return length;
+}
+SLAB_ATTR(remote_node_defrag_ratio);
+#endif
+
+#ifdef CONFIG_SLUB_STATS
+/*
+ * Format one per-cpu statistics counter for sysfs.
+ *
+ * The output is the sum over all online CPUs, followed (on SMP) by a
+ * " C<cpu>=<count>" entry for each online CPU with a non-zero count, then
+ * a newline. Per-cpu entries are appended only while more than 20 bytes
+ * of the PAGE_SIZE buffer remain.
+ *
+ * Returns the number of bytes written, or -ENOMEM if the temporary
+ * snapshot array cannot be allocated.
+ */
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+	unsigned long sum = 0;
+	int cpu;
+	int len;
+	/*
+	 * kmalloc_array() checks the count * size multiplication for
+	 * overflow. The elements are unsigned to match both the per-cpu
+	 * counters read below and the %u conversion used to print them.
+	 */
+	unsigned int *data = kmalloc_array(nr_cpu_ids, sizeof(*data),
+					   GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	/* Snapshot first so the sum and the per-cpu values printed agree. */
+	for_each_online_cpu(cpu) {
+		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
+
+		data[cpu] = x;
+		sum += x;
+	}
+
+	len = sprintf(buf, "%lu", sum);
+
+#ifdef CONFIG_SMP
+	for_each_online_cpu(cpu) {
+		if (data[cpu] && len < PAGE_SIZE - 20)
+			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
+	}
+#endif
+	kfree(data);
+	return len + sprintf(buf + len, "\n");
+}
+
+/* Zero statistics counter @si in the per-cpu data of every online CPU. */
+static void clear_stat(struct kmem_cache *s, enum stat_item si)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
+}
+
+/*
+ * STAT_ATTR(si, text) defines a read/write sysfs attribute named <text>
+ * for statistics item <si>: reading prints the counter via show_stat(),
+ * writing a string that starts with '0' resets it via clear_stat(), and
+ * any other write fails with -EINVAL.
+ */
+#define STAT_ATTR(si, text) \
+static ssize_t text##_show(struct kmem_cache *s, char *buf) \
+{ \
+ return show_stat(s, buf, si); \
+} \
+static ssize_t text##_store(struct kmem_cache *s, \
+ const char *buf, size_t length) \
+{ \
+ if (buf[0] != '0') \
+ return -EINVAL; \
+ clear_stat(s, si); \
+ return length; \
+} \
+SLAB_ATTR(text); \
+
+STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
+STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_FASTPATH, free_fastpath);
+STAT_ATTR(FREE_SLOWPATH, free_slowpath);
+STAT_ATTR(FREE_FROZEN, free_frozen);
+STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
+STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
+STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
+STAT_ATTR(ALLOC_SLAB, alloc_slab);
+STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
+STAT_ATTR(FREE_SLAB, free_slab);
+STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
+STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
+STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
+STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
+STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
+STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
+STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
+STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
+STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
+STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
+STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
+#endif
+
+/*
+ * NULL-terminated table of every sysfs attribute exposed per cache.
+ * Entries guarded by a config option are present only when that option is
+ * enabled. The table is published through slab_attr_group and is also
+ * walked by memcg_propagate_slab_attrs().
+ */
+static struct attribute *slab_attrs[] = {
+ &slab_size_attr.attr,
+ &object_size_attr.attr,
+ &objs_per_slab_attr.attr,
+ &order_attr.attr,
+ &min_partial_attr.attr,
+ &cpu_partial_attr.attr,
+ &objects_attr.attr,
+ &objects_partial_attr.attr,
+ &partial_attr.attr,
+ &cpu_slabs_attr.attr,
+ &ctor_attr.attr,
+ &aliases_attr.attr,
+ &align_attr.attr,
+ &hwcache_align_attr.attr,
+ &reclaim_account_attr.attr,
+ &destroy_by_rcu_attr.attr,
+ &shrink_attr.attr,
+ &reserved_attr.attr,
+ &slabs_cpu_partial_attr.attr,
+#ifdef CONFIG_SLUB_DEBUG
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
+ &red_zone_attr.attr,
+ &poison_attr.attr,
+ &store_user_attr.attr,
+ &validate_attr.attr,
+ &alloc_calls_attr.attr,
+ &free_calls_attr.attr,
+#endif
+#ifdef CONFIG_ZONE_DMA
+ &cache_dma_attr.attr,
+#endif
+#ifdef CONFIG_NUMA
+ &remote_node_defrag_ratio_attr.attr,
+#endif
+#ifdef CONFIG_SLUB_STATS
+ &alloc_fastpath_attr.attr,
+ &alloc_slowpath_attr.attr,
+ &free_fastpath_attr.attr,
+ &free_slowpath_attr.attr,
+ &free_frozen_attr.attr,
+ &free_add_partial_attr.attr,
+ &free_remove_partial_attr.attr,
+ &alloc_from_partial_attr.attr,
+ &alloc_slab_attr.attr,
+ &alloc_refill_attr.attr,
+ &alloc_node_mismatch_attr.attr,
+ &free_slab_attr.attr,
+ &cpuslab_flush_attr.attr,
+ &deactivate_full_attr.attr,
+ &deactivate_empty_attr.attr,
+ &deactivate_to_head_attr.attr,
+ &deactivate_to_tail_attr.attr,
+ &deactivate_remote_frees_attr.attr,
+ &deactivate_bypass_attr.attr,
+ &order_fallback_attr.attr,
+ &cmpxchg_double_fail_attr.attr,
+ &cmpxchg_double_cpu_fail_attr.attr,
+ &cpu_partial_alloc_attr.attr,
+ &cpu_partial_free_attr.attr,
+ &cpu_partial_node_attr.attr,
+ &cpu_partial_drain_attr.attr,
+#endif
+#ifdef CONFIG_FAILSLAB
+ &failslab_attr.attr,
+#endif
+
+ NULL
+};
+
+/* Attribute group wrapping slab_attrs; created in sysfs_slab_add(). */
+static struct attribute_group slab_attr_group = {
+ .attrs = slab_attrs,
+};
+
+/*
+ * sysfs_ops ->show hook: map the kobject/attribute pair back to its
+ * kmem_cache and slab_attribute, then call the attribute's own handler.
+ * Returns -EIO for attributes that have no show handler.
+ */
+static ssize_t slab_attr_show(struct kobject *kobj,
+				struct attribute *attr,
+				char *buf)
+{
+	struct slab_attribute *sattr = to_slab_attr(attr);
+	struct kmem_cache *cache = to_slab(kobj);
+	int ret;
+
+	if (!sattr->show)
+		return -EIO;
+
+	/* The handler's result is carried through an int before returning. */
+	ret = sattr->show(cache, buf);
+	return ret;
+}
+
+/*
+ * sysfs_ops ->store hook: call the attribute's store handler on the cache
+ * behind @kobj. Returns -EIO for attributes that have no store handler,
+ * otherwise the handler's result for that cache.
+ *
+ * With CONFIG_MEMCG_KMEM, a successful write to a root cache (once
+ * slab_state has reached FULL) is also replayed on each of its memcg
+ * caches under slab_mutex, and the longest write seen is recorded in
+ * s->max_attr_size.
+ */
+static ssize_t slab_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct slab_attribute *attribute;
+ struct kmem_cache *s;
+ int err;
+
+ attribute = to_slab_attr(attr);
+ s = to_slab(kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ err = attribute->store(s, buf, len);
+#ifdef CONFIG_MEMCG_KMEM
+ if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
+ int i;
+
+ mutex_lock(&slab_mutex);
+ if (s->max_attr_size < len)
+ s->max_attr_size = len;
+
+ /*
+ * This is a best effort propagation, so this function's return
+ * value will be determined by the parent cache only. This is
+ * basically because not all attributes will have a well
+ * defined semantics for rollbacks - most of the actions will
+ * have permanent effects.
+ *
+ * Returning the error value of any of the children that fail
+ * is not 100 % defined, in the sense that users seeing the
+ * error code won't be able to know anything about the state of
+ * the cache.
+ *
+ * Only returning the error code for the parent cache at least
+ * has well defined semantics. The cache being written to
+ * directly either failed or succeeded, in which case we loop
+ * through the descendants with best-effort propagation.
+ */
+ for_each_memcg_cache_index(i) {
+ struct kmem_cache *c = cache_from_memcg_idx(s, i);
+ if (c)
+ attribute->store(c, buf, len);
+ }
+ mutex_unlock(&slab_mutex);
+ }
+#endif
+ return err;
+}
+
+/*
+ * Copy attribute settings from the root cache of memcg cache @s to @s by
+ * reading each read/write attribute from the root cache and writing the
+ * resulting text to @s.
+ *
+ * Does nothing when CONFIG_MEMCG_KMEM is off, when @s is itself a root
+ * cache, or when no attribute was ever written on the root cache
+ * (max_attr_size is still zero).
+ */
+static void memcg_propagate_slab_attrs(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+	int i;
+	char *buffer = NULL;
+	struct kmem_cache *root_cache;
+
+	if (is_root_cache(s))
+		return;
+
+	root_cache = s->memcg_params->root_cache;
+
+	/*
+	 * This means this cache had no attribute written. Therefore, no point
+	 * in copying default values around.
+	 */
+	if (!root_cache->max_attr_size)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
+		char mbuf[64];
+		char *buf;
+		ssize_t len;
+		/*
+		 * The last table entry is the NULL terminator; the !attr
+		 * test below is what skips it.
+		 * NOTE(review): that relies on to_slab_attr(NULL) yielding
+		 * NULL - confirm against the slab_attribute layout.
+		 */
+		struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+
+		if (!attr || !attr->store || !attr->show)
+			continue;
+
+		/*
+		 * It is really bad that we have to allocate here, so we will
+		 * do it only as a fallback. If we actually allocate, though,
+		 * we can just use the allocated buffer until the end.
+		 *
+		 * Most of the slub attributes will tend to be very small in
+		 * size, but sysfs allows buffers up to a page, so they can
+		 * theoretically happen.
+		 *
+		 * NOTE(review): max_attr_size records the longest write seen
+		 * by slab_attr_store(), not the size of ->show() output;
+		 * confirm no ->show() can emit 64 bytes or more when mbuf is
+		 * selected.
+		 */
+		if (buffer)
+			buf = buffer;
+		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+			buf = mbuf;
+		else {
+			buffer = (char *) get_zeroed_page(GFP_KERNEL);
+			if (WARN_ON(!buffer))
+				continue;
+			buf = buffer;
+		}
+
+		/*
+		 * Some ->show() handlers return 0 without touching the
+		 * buffer, and a handler may also fail. Replaying the buffer
+		 * in those cases would feed ->store() stale text from an
+		 * earlier attribute or uninitialised stack bytes, so only
+		 * propagate what ->show() reports it actually produced.
+		 */
+		len = attr->show(root_cache, buf);
+		if (len > 0)
+			attr->store(s, buf, len);
+	}
+
+	if (buffer)
+		free_page((unsigned long)buffer);
+#endif
+}
+
+/*
+ * kobject release callback for slab_ktype: hands the owning cache to
+ * slab_kmem_cache_release().
+ */
+static void kmem_cache_release(struct kobject *k)
+{
+ slab_kmem_cache_release(to_slab(k));
+}
+
+/* Route sysfs reads and writes on cache kobjects to the dispatchers above. */
+static const struct sysfs_ops slab_sysfs_ops = {
+ .show = slab_attr_show,
+ .store = slab_attr_store,
+};
+
+/* kobject type used for every cache registered by sysfs_slab_add(). */
+static struct kobj_type slab_ktype = {
+ .sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release,
+};
+
+/*
+ * kset uevent filter: returns 1 for kobjects whose type is slab_ktype and
+ * 0 for everything else.
+ */
+static int uevent_filter(struct kset *kset, struct kobject *kobj)
+{
+	return get_ktype(kobj) == &slab_ktype;
+}
+
+/* uevent operations passed to kset_create_and_add() for the "slab" kset. */
+static const struct kset_uevent_ops slab_uevent_ops = {
+ .filter = uevent_filter,
+};
+
+/* The "slab" kset; created in slab_sysfs_init(). */
+static struct kset *slab_kset;
+
+/*
+ * Pick the kset a cache's kobject belongs to: with CONFIG_MEMCG_KMEM a
+ * non-root cache uses its root cache's memcg_kset, everything else uses
+ * slab_kset.
+ */
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->memcg_kset;
+#endif
+ return slab_kset;
+}
+
+/* Size in bytes of the buffer allocated for a unique id string. */
+#define ID_STR_LENGTH 64
+
+/* Create a unique string id for a slab cache:
+ *
+ * Format :[flags-]size
+ *
+ * With CONFIG_MEMCG_KMEM, a non-root cache additionally gets a
+ * "-<memcg cache id>" suffix. The string is kmalloc()ed and the caller
+ * must kfree() it.
+ *
+ * NOTE(review): an allocation failure hits BUG_ON() rather than being
+ * reported to the caller.
+ */
+static char *create_unique_id(struct kmem_cache *s)
+{
+ char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
+ char *p = name;
+
+ BUG_ON(!name);
+
+ *p++ = ':';
+ /*
+ * First flags affecting slabcache operations. We will only
+ * get here for aliasable slabs so we do not need to support
+ * too many flags. The flags here must cover all flags that
+ * are matched during merging to guarantee that the id is
+ * unique.
+ */
+ if (s->flags & SLAB_CACHE_DMA)
+ *p++ = 'd';
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ *p++ = 'a';
+ if (s->flags & SLAB_DEBUG_FREE)
+ *p++ = 'F';
+ if (!(s->flags & SLAB_NOTRACK))
+ *p++ = 't';
+ /* separate the flag letters from the size only if any were emitted */
+ if (p != name + 1)
+ *p++ = '-';
+ p += sprintf(p, "%07d", s->size);
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ p += sprintf(p, "-%08d",
+ memcg_cache_id(s->memcg_params->memcg));
+#endif
+
+ /* sanity check performed after the string has already been written */
+ BUG_ON(p > name + ID_STR_LENGTH - 1);
+ return name;
+}
+
+/*
+ * Register cache @s in sysfs: add its kobject (named after the cache if it
+ * is unmergeable, otherwise after a generated unique id), create the
+ * attribute group, create the per-cache "cgroup" kset for root caches
+ * when CONFIG_MEMCG_KMEM is set, emit KOBJ_ADD, and for mergeable caches
+ * create the first alias link under the cache's real name.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int sysfs_slab_add(struct kmem_cache *s)
+{
+ int err;
+ const char *name;
+ int unmergeable = slab_unmergeable(s);
+
+ if (unmergeable) {
+ /*
+ * Slabcache can never be merged so we can use the name proper.
+ * This is typically the case for debug situations. In that
+ * case we can catch duplicate names easily.
+ */
+ sysfs_remove_link(&slab_kset->kobj, s->name);
+ name = s->name;
+ } else {
+ /*
+ * Create a unique name for the slab as a target
+ * for the symlinks.
+ */
+ name = create_unique_id(s);
+ }
+
+ s->kobj.kset = cache_kset(s);
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
+ if (err)
+ goto out_put_kobj;
+
+ err = sysfs_create_group(&s->kobj, &slab_attr_group);
+ if (err)
+ goto out_del_kobj;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (is_root_cache(s)) {
+ s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+ if (!s->memcg_kset) {
+ err = -ENOMEM;
+ goto out_del_kobj;
+ }
+ }
+#endif
+
+ kobject_uevent(&s->kobj, KOBJ_ADD);
+ if (!unmergeable) {
+ /* Setup first alias */
+ sysfs_slab_alias(s, s->name);
+ }
+out:
+ /* name was allocated by create_unique_id() only in the mergeable case */
+ if (!unmergeable)
+ kfree(name);
+ return err;
+out_del_kobj:
+ kobject_del(&s->kobj);
+out_put_kobj:
+ kobject_put(&s->kobj);
+ goto out;
+}
+
+/*
+ * Remove cache @s from sysfs: unregister its memcg kset (with
+ * CONFIG_MEMCG_KMEM), emit KOBJ_REMOVE, then delete the kobject and drop
+ * a reference to it. A no-op before slab_state reaches FULL.
+ */
+void sysfs_slab_remove(struct kmem_cache *s)
+{
+ if (slab_state < FULL)
+ /*
+ * Sysfs has not been setup yet so no need to remove the
+ * cache from sysfs.
+ */
+ return;
+
+#ifdef CONFIG_MEMCG_KMEM
+ kset_unregister(s->memcg_kset);
+#endif
+ kobject_uevent(&s->kobj, KOBJ_REMOVE);
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
+}
+
+/*
+ * Need to buffer aliases during bootup until sysfs becomes
+ * available lest we lose that information.
+ *
+ * Entries form a singly linked list headed by alias_list; they are
+ * pushed by sysfs_slab_alias() and drained by slab_sysfs_init().
+ */
+struct saved_alias {
+ struct kmem_cache *s;
+ const char *name;
+ struct saved_alias *next;
+};
+
+static struct saved_alias *alias_list;
+
+/*
+ * Make @name an alias of cache @s. Once slab_state is FULL this replaces
+ * any existing link of that name under the slab kset with a link to the
+ * cache's kobject and returns the sysfs_create_link() result. Before
+ * that, the request is queued on alias_list; returns 0, or -ENOMEM if the
+ * list entry cannot be allocated.
+ *
+ * @name is stored by pointer, not copied.
+ */
+static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{
+ struct saved_alias *al;
+
+ if (slab_state == FULL) {
+ /*
+ * If we have a leftover link then remove it.
+ */
+ sysfs_remove_link(&slab_kset->kobj, name);
+ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
+ }
+
+ al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
+ if (!al)
+ return -ENOMEM;
+
+ al->s = s;
+ al->name = name;
+ al->next = alias_list;
+ alias_list = al;
+ return 0;
+}
+
+/*
+ * Initcall that brings up the sysfs interface: creates the "slab" kset,
+ * moves slab_state to FULL, registers every cache already on slab_caches,
+ * and replays the aliases queued on alias_list. Failures for individual
+ * caches or aliases are logged and skipped.
+ *
+ * Returns 0, or -ENOSYS if the kset cannot be created.
+ */
+static int __init slab_sysfs_init(void)
+{
+ struct kmem_cache *s;
+ int err;
+
+ mutex_lock(&slab_mutex);
+
+ slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ if (!slab_kset) {
+ mutex_unlock(&slab_mutex);
+ pr_err("Cannot register slab subsystem.\n");
+ return -ENOSYS;
+ }
+
+ /* from here on sysfs_slab_alias() creates links instead of queueing */
+ slab_state = FULL;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ err = sysfs_slab_add(s);
+ if (err)
+ pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
+ s->name);
+ }
+
+ while (alias_list) {
+ struct saved_alias *al = alias_list;
+
+ alias_list = alias_list->next;
+ err = sysfs_slab_alias(al->s, al->name);
+ if (err)
+ pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
+ al->name);
+ kfree(al);
+ }
+
+ mutex_unlock(&slab_mutex);
+ resiliency_test();
+ return 0;
+}
+
+__initcall(slab_sysfs_init);
+#endif /* CONFIG_SYSFS */
+
+/*
+ * The /proc/slabinfo ABI
+ */
+#ifdef CONFIG_SLABINFO
+/*
+ * Fill @sinfo for /proc/slabinfo by summing slab, object and free-object
+ * counts over every online node that has a kmem_cache_node for @s. Free
+ * objects are counted on partial slabs via count_partial(). Active and
+ * total slab counts are reported as the same number.
+ */
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
+{
+	unsigned long slabs = 0;
+	unsigned long objs = 0;
+	unsigned long free_objs = 0;
+	int nid;
+
+	for_each_online_node(nid) {
+		struct kmem_cache_node *kn = get_node(s, nid);
+
+		if (kn) {
+			slabs += node_nr_slabs(kn);
+			objs += node_nr_objs(kn);
+			free_objs += count_partial(kn, count_free);
+		}
+	}
+
+	sinfo->num_objs = objs;
+	sinfo->active_objs = objs - free_objs;
+	sinfo->num_slabs = slabs;
+	sinfo->active_slabs = slabs;
+	sinfo->cache_order = oo_order(s->oo);
+	sinfo->objects_per_slab = oo_objects(s->oo);
+}
+
+/* Intentionally empty: nothing extra is printed per cache here. */
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
+{
+}
+
+/* Writes to slabinfo are not supported here: always fails with -EIO. */
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return -EIO;
+}
+#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 00000000000..4cba9c2783a
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,235 @@
+/*
+ * Virtual Memory Map support
+ *
+ * (C) 2007 sgi. Christoph Lameter.
+ *
+ * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
+ * virt_to_page, page_address() to be implemented as a base offset
+ * calculation without memory access.
+ *
+ * However, virtual mappings need a page table and TLBs. Many Linux
+ * architectures already map their physical space using 1-1 mappings
+ * via TLBs. For those arches the virtual memory map is essentially
+ * for free if we use the same page size as the 1-1 mappings. In that
+ * case the overhead consists of a few additional pages that are
+ * allocated to create a view of memory for vmemmap.
+ *
+ * The architecture is expected to provide a vmemmap_populate() function
+ * to instantiate the mapping.
+ */
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+
+/*
+ * Allocate a block of memory to be used to back the virtual memory map
+ * or to back the page tables that are used to create the mapping.
+ * Uses the main allocators if they are available, else bootmem.
+ */
+
+/*
+ * Boot-time allocation helper: forwards to memblock_virt_alloc_try_nid()
+ * with BOOTMEM_ALLOC_ACCESSIBLE as the upper limit.
+ */
+static void * __init_refok __earlyonly_bootmem_alloc(int node,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal)
+{
+ return memblock_virt_alloc_try_nid(size, align, goal,
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
+}
+
+/*
+ * Bounds of the unused part of the early per-node buffer. Set up and torn
+ * down by sparse_mem_maps_populate_node(), consumed by
+ * vmemmap_alloc_block_buf(); vmemmap_buf is NULL when no buffer is active.
+ */
+static void *vmemmap_buf;
+static void *vmemmap_buf_end;
+
+/*
+ * Allocate @size bytes of backing memory for the vmemmap or its page
+ * tables.
+ *
+ * Before the slab allocator is available this comes from the early
+ * allocator, aligned to @size and with MAX_DMA_ADDRESS as the goal.
+ * Afterwards zeroed pages are taken from the page allocator, on @node if
+ * that node is in the N_HIGH_MEMORY state and from any node otherwise.
+ * In the page-allocator case NULL is returned on failure.
+ */
+void * __meminit vmemmap_alloc_block(unsigned long size, int node)
+{
+	struct page *page;
+
+	/* Main allocator not up yet: fall back to bootmem. */
+	if (!slab_is_available())
+		return __earlyonly_bootmem_alloc(node, size, size,
+				__pa(MAX_DMA_ADDRESS));
+
+	if (node_state(node, N_HIGH_MEMORY))
+		page = alloc_pages_node(node,
+				GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+				get_order(size));
+	else
+		page = alloc_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT,
+				get_order(size));
+
+	return page ? page_address(page) : NULL;
+}
+
+/*
+ * Allocate @size bytes, preferring the early vmemmap buffer. The request
+ * is carved from the buffer at a @size-aligned address when it fits;
+ * otherwise (or when no buffer is active) it falls through to
+ * vmemmap_alloc_block().
+ *
+ * Need to make sure size is all the same during early stage.
+ */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+	if (vmemmap_buf) {
+		/* take it from the buffer if the aligned chunk still fits */
+		void *chunk = (void *)ALIGN((unsigned long)vmemmap_buf, size);
+
+		if (chunk + size <= vmemmap_buf_end) {
+			vmemmap_buf = chunk + size;
+			return chunk;
+		}
+	}
+
+	return vmemmap_alloc_block(size, node);
+}
+
+/*
+ * Warn when the page backing @pte sits on a node farther than
+ * LOCAL_DISTANCE from @node. @start and @end are used only in the
+ * message, which prints the range as [start, end - 1].
+ */
+void __meminit vmemmap_verify(pte_t *pte, int node,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pfn = pte_pfn(*pte);
+ int actual_node = early_pfn_to_nid(pfn);
+
+ if (node_distance(actual_node, node) > LOCAL_DISTANCE)
+ printk(KERN_WARNING "[%lx-%lx] potential offnode "
+ "page_structs\n", start, end - 1);
+}
+
+/*
+ * Return the kernel PTE for @addr under @pmd, first backing it with a
+ * freshly allocated page (mapped PAGE_KERNEL) if it is empty. Returns
+ * NULL if that allocation fails.
+ */
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+	void *backing;
+	pte_t entry;
+
+	if (!pte_none(*pte))
+		return pte;
+
+	backing = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+	if (!backing)
+		return NULL;
+
+	entry = pfn_pte(__pa(backing) >> PAGE_SHIFT, PAGE_KERNEL);
+	set_pte_at(&init_mm, addr, pte, entry);
+	return pte;
+}
+
+/*
+ * Return the PMD entry for @addr under @pud, populating it with a newly
+ * allocated page if it is empty. Returns NULL if the allocation fails.
+ */
+pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	void *table;
+
+	if (!pmd_none(*pmd))
+		return pmd;
+
+	table = vmemmap_alloc_block(PAGE_SIZE, node);
+	if (!table)
+		return NULL;
+
+	pmd_populate_kernel(&init_mm, pmd, table);
+	return pmd;
+}
+
+/*
+ * Return the PUD entry for @addr under @pgd, populating it with a newly
+ * allocated page if it is empty. Returns NULL if the allocation fails.
+ */
+pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	void *table;
+
+	if (!pud_none(*pud))
+		return pud;
+
+	table = vmemmap_alloc_block(PAGE_SIZE, node);
+	if (!table)
+		return NULL;
+
+	pud_populate(&init_mm, pud, table);
+	return pud;
+}
+
+/*
+ * Return the kernel PGD entry for @addr, populating it with a newly
+ * allocated page if it is empty. Returns NULL if the allocation fails.
+ */
+pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
+{
+	pgd_t *pgd = pgd_offset_k(addr);
+	void *table;
+
+	if (!pgd_none(*pgd))
+		return pgd;
+
+	table = vmemmap_alloc_block(PAGE_SIZE, node);
+	if (!table)
+		return NULL;
+
+	pgd_populate(&init_mm, pgd, table);
+	return pgd;
+}
+
+/*
+ * Populate [start, end) one PAGE_SIZE step at a time, creating every page
+ * table level on demand and checking the node placement of each mapped
+ * page with vmemmap_verify().
+ *
+ * Returns 0 on success or -ENOMEM as soon as any level cannot be
+ * populated; entries set up before the failure are left in place.
+ */
+int __meminit vmemmap_populate_basepages(unsigned long start,
+					 unsigned long end, int node)
+{
+	unsigned long va;
+
+	for (va = start; va < end; va += PAGE_SIZE) {
+		pgd_t *pgd;
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		pgd = vmemmap_pgd_populate(va, node);
+		if (!pgd)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(pgd, va, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = vmemmap_pmd_populate(pud, va, node);
+		if (!pmd)
+			return -ENOMEM;
+
+		pte = vmemmap_pte_populate(pmd, va, node);
+		if (!pte)
+			return -ENOMEM;
+
+		vmemmap_verify(pte, node, va, va + PAGE_SIZE);
+	}
+
+	return 0;
+}
+
+/*
+ * Back the struct page array of section @pnum by calling
+ * vmemmap_populate() on its virtual address range. Returns the start of
+ * the array, or NULL if population fails.
+ */
+struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
+{
+	struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
+	unsigned long va_start = (unsigned long)map;
+	unsigned long va_end = (unsigned long)(map + PAGES_PER_SECTION);
+
+	return vmemmap_populate(va_start, va_end, nid) ? NULL : map;
+}
+
+/*
+ * Populate the memory maps of all present sections in
+ * [pnum_begin, pnum_end) for node @nodeid, storing each result in
+ * @map_map[pnum].
+ *
+ * One PMD_SIZE-aligned buffer sized for @map_count sections is allocated
+ * up front so vmemmap_alloc_block_buf() can carve from it; whatever is
+ * left afterwards is handed back. A section whose map cannot be
+ * populated is logged and has its section_mem_map cleared.
+ */
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+ void *vmemmap_buf_start;
+
+ size = ALIGN(size, PMD_SIZE);
+ vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
+ PMD_SIZE, __pa(MAX_DMA_ADDRESS));
+
+ /* without a buffer, allocations go through vmemmap_alloc_block() */
+ if (vmemmap_buf_start) {
+ vmemmap_buf = vmemmap_buf_start;
+ vmemmap_buf_end = vmemmap_buf_start + size * map_count;
+ }
+
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+
+ if (vmemmap_buf_start) {
+ /* need to free left buf */
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
+ vmemmap_buf = NULL;
+ vmemmap_buf_end = NULL;
+ }
+}
diff --git a/mm/sparse.c b/mm/sparse.c
index 86c52ab8087..d1b48b691ac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,13 +2,19 @@
* sparse memory mappings.
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/compiler.h>
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+
+#include "internal.h"
#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
/*
* Permanent SPARSEMEM data:
@@ -24,50 +30,68 @@ struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
#endif
EXPORT_SYMBOL(mem_section);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+/* element width follows MAX_NUMNODES: u8 up to 256 nodes, u16 above */
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
+
+/* Look up the node of @page through the section it belongs to. */
+int page_to_nid(const struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+
+/* Record @nid as the node of section @section_nr. */
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+ section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+/* No table to maintain in this configuration. */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_EXTREME
-static struct mem_section *sparse_index_alloc(int nid)
+static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
{
struct mem_section *section = NULL;
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available())
- section = kmalloc_node(array_size, GFP_KERNEL, nid);
- else
- section = alloc_bootmem_node(NODE_DATA(nid), array_size);
-
- if (section)
- memset(section, 0, array_size);
+ if (slab_is_available()) {
+ if (node_state(nid, N_HIGH_MEMORY))
+ section = kzalloc_node(array_size, GFP_KERNEL, nid);
+ else
+ section = kzalloc(array_size, GFP_KERNEL);
+ } else {
+ section = memblock_virt_alloc_node(array_size, nid);
+ }
return section;
}
-static int sparse_index_init(unsigned long section_nr, int nid)
+static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
- static DEFINE_SPINLOCK(index_init_lock);
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
- int ret = 0;
if (mem_section[root])
return -EEXIST;
section = sparse_index_alloc(nid);
- /*
- * This lock keeps two different sections from
- * reallocating for the same index
- */
- spin_lock(&index_init_lock);
-
- if (mem_section[root]) {
- ret = -EEXIST;
- goto out;
- }
+ if (!section)
+ return -ENOMEM;
mem_section[root] = section;
-out:
- spin_unlock(&index_init_lock);
- return ret;
+
+ return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
@@ -78,7 +102,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
/*
* Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case becase
+ * to also work for the flat array case because
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
*/
int __section_nr(struct mem_section* ms)
@@ -95,6 +119,8 @@ int __section_nr(struct mem_section* ms)
break;
}
+ VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
@@ -114,17 +140,45 @@ static inline int sparse_early_nid(struct mem_section *section)
return (section->section_mem_map >> SECTION_NID_SHIFT);
}
+/*
+ * Validate the physical addressing limitations of the model.
+ *
+ * Clamps the pfn range in place to 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT):
+ * if the start is beyond the limit both ends are set to the limit, and if
+ * only the end is beyond it just the end is clamped. Either case is
+ * reported via mminit_dprintk() and a one-time WARN.
+ */
+void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+
+ /*
+ * Sanity checks - do not allow an architecture to pass
+ * in larger pfns than the maximum scope of sparsemem:
+ */
+ if (*start_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *start_pfn = max_sparsemem_pfn;
+ *end_pfn = max_sparsemem_pfn;
+ } else if (*end_pfn > max_sparsemem_pfn) {
+ mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
+ "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
+ *start_pfn, *end_pfn, max_sparsemem_pfn);
+ WARN_ON_ONCE(1);
+ *end_pfn = max_sparsemem_pfn;
+ }
+}
+
/* Record a memory area against a node. */
-void memory_present(int nid, unsigned long start, unsigned long end)
+void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
start &= PAGE_SECTION_MASK;
+ mminit_validate_memmodel_limits(&start, &end);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
unsigned long section = pfn_to_section_nr(pfn);
struct mem_section *ms;
sparse_index_init(section, nid);
+ set_section_nid(section, nid);
ms = __nr_to_section(section);
if (!ms->section_mem_map)
@@ -143,11 +197,12 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long pfn;
unsigned long nr_pages = 0;
+ mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
if (nid != early_pfn_to_nid(pfn))
continue;
- if (pfn_valid(pfn))
+ if (pfn_present(pfn))
nr_pages += PAGES_PER_SECTION;
}
@@ -165,53 +220,409 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
}
/*
- * We need this if we ever free the mem_maps. While not implemented yet,
- * this function is included for parity with its sibling.
+ * Decode mem_map from the coded memmap
*/
-static __attribute((unused))
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
+ /* mask off the extra low bits of information */
+ coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
-static int sparse_init_one_section(struct mem_section *ms,
- unsigned long pnum, struct page *mem_map)
+static int __meminit sparse_init_one_section(struct mem_section *ms,
+ unsigned long pnum, struct page *mem_map,
+ unsigned long *pageblock_bitmap)
{
- if (!valid_section(ms))
+ if (!present_section(ms))
return -EINVAL;
ms->section_mem_map &= ~SECTION_MAP_MASK;
- ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
+ SECTION_HAS_MEM_MAP;
+ ms->pageblock_flags = pageblock_bitmap;
return 1;
}
-static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
+/*
+ * Size in bytes of one section's usemap: SECTION_BLOCKFLAGS_BITS rounded
+ * up to whole bytes, then rounded up to a multiple of
+ * sizeof(unsigned long).
+ */
+unsigned long usemap_size(void)
+{
+	unsigned long nbytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
+
+	nbytes = roundup(nbytes, sizeof(unsigned long));
+	return nbytes;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Allocate one section's usemap with kmalloc(); the memory is not zeroed
+ * here. Returns NULL on failure.
+ */
+static unsigned long *__kmalloc_section_usemap(void)
+{
+ return kmalloc(usemap_size(), GFP_KERNEL);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Allocate @size bytes for usemaps, first trying the memory section that
+ * holds @pgdat and, if that fails, retrying once with the upper limit
+ * removed. Returns NULL if both attempts fail.
+ */
+static unsigned long * __init
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long size)
+{
+ unsigned long goal, limit;
+ unsigned long *p;
+ int nid;
+ /*
+ * A page may contain usemaps for other sections preventing the
+ * page being freed and making a section unremovable while
+ * other sections referencing the usemap remain active. Similarly,
+ * a pgdat can prevent a section being removed. If section A
+ * contains a pgdat and section B contains the usemap, both
+ * sections become inter-dependent. This allocates usemaps
+ * from the same section as the pgdat where possible to avoid
+ * this problem.
+ */
+ /* goal is the section-aligned physical address of the pgdat */
+ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ limit = goal + (1UL << PA_SECTION_SHIFT);
+ nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
+again:
+ p = memblock_virt_alloc_try_nid_nopanic(size,
+ SMP_CACHE_BYTES, goal, limit,
+ nid);
+ if (!p && limit) {
+ /* second and last attempt: drop the upper bound */
+ limit = 0;
+ goto again;
+ }
+ return p;
+}
+
+/*
+ * Report when @usemap ended up in a different memory section than the
+ * pgdat of node @nid. Nothing is printed when both share a section, or
+ * when the (usemap section, pgdat section) pair equals the last pair
+ * reported. Informational only; nothing is changed.
+ */
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+ unsigned long usemap_snr, pgdat_snr;
+ /* last reported pair; NR_MEM_SECTIONS is the initial value */
+ static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
+ static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int usemap_nid;
+
+ usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ if (usemap_snr == pgdat_snr)
+ return;
+
+ if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
+ /* skip redundant message */
+ return;
+
+ old_usemap_snr = usemap_snr;
+ old_pgdat_snr = pgdat_snr;
+
+ usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
+ if (usemap_nid != nid) {
+ printk(KERN_INFO
+ "node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
+ return;
+ }
+ /*
+ * There is a circular dependency.
+ * Some platforms allow un-removable section because they will just
+ * gather other removable sections for dynamic partitioning.
+ * Just notify un-removable section's number here.
+ */
+ printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
+ pgdat_snr, nid);
+ printk(KERN_CONT
+ " have a circular dependency on usemap and pgdat allocations\n");
+}
+#else
+/*
+ * !CONFIG_MEMORY_HOTREMOVE variant: allocate @size bytes on the pgdat's
+ * own node with no section placement constraint.
+ */
+static unsigned long * __init
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long size)
+{
+ return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+}
+
+/* !CONFIG_MEMORY_HOTREMOVE variant: nothing to check. */
+static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/*
+ * Allocate the usemaps for node @nodeid in one chunk of
+ * usemap_size() * @usemap_count bytes and hand one slice to every present
+ * section in [pnum_begin, pnum_end). @data is the usemap pointer array,
+ * indexed by section number. On allocation failure a warning is printed
+ * and the array is left untouched.
+ */
+static void __init sparse_early_usemaps_alloc_node(void *data,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long usemap_count, int nodeid)
+{
+ void *usemap;
+ unsigned long pnum;
+ unsigned long **usemap_map = (unsigned long **)data;
+ int size = usemap_size();
+
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
+ size * usemap_count);
+ if (!usemap) {
+ printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ return;
+ }
+
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ /* advance to the next slice before checking the one just assigned */
+ usemap += size;
+ check_usemap_section_nr(nodeid, usemap_map[pnum]);
+ }
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
{
struct page *map;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
+ unsigned long size;
map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
if (map)
return map;
- map = alloc_bootmem_node(NODE_DATA(nid),
- sizeof(struct page) * PAGES_PER_SECTION);
+ size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+ map = memblock_virt_alloc_try_nid(size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ return map;
+}
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ void *map;
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+
+ map = alloc_remap(nodeid, size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ map = memblock_virt_alloc_try_nid(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ /* fallback */
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+static void __init sparse_early_mem_maps_alloc_node(void *data,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ struct page **map_map = (struct page **)data;
+ sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
+ map_count, nodeid);
+}
+#else
+static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
+{
+ struct page *map;
+ struct mem_section *ms = __nr_to_section(pnum);
+ int nid = sparse_early_nid(ms);
+
+ map = sparse_mem_map_populate(pnum, nid);
if (map)
return map;
- printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
ms->section_mem_map = 0;
return NULL;
}
+#endif
+
+void __weak __meminit vmemmap_populate_print_last(void)
+{
+}
+
+/**
+ * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap
+ * @data: usemap_map for pageblock flags or map_map for vmemmap
+ */
+static void __init alloc_usemap_and_memmap(void (*alloc_func)
+ (void *, unsigned long, unsigned long,
+ unsigned long, int), void *data)
+{
+ unsigned long pnum;
+ unsigned long map_count;
+ int nodeid_begin = 0;
+ unsigned long pnum_begin = 0;
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ map_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ map_count++;
+ continue;
+ }
+ /* ok, we need to take care of sections pnum_begin to pnum - 1 */
+ alloc_func(data, pnum_begin, pnum,
+ map_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ map_count = 1;
+ }
+ /* ok, last chunk */
+ alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
+ map_count, nodeid_begin);
+}
-static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void __init sparse_init(void)
+{
+ unsigned long pnum;
+ struct page *map;
+ unsigned long *usemap;
+ unsigned long **usemap_map;
+ int size;
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ int size2;
+ struct page **map_map;
+#endif
+
+ /* see include/linux/mmzone.h 'struct mem_section' definition */
+ BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+
+ /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
+ set_pageblock_order();
+
+ /*
+ * map is using big page (aka 2M in x86 64 bit)
+ * usemap is less than one page (aka 24 bytes)
+ * so alloc 2M (with 2M align) and 24 bytes in turn will
+ * make next 2M slip to one more 2M later.
+ * then in big system, the memory will have a lot of holes...
+ * here try to allocate 2M pages continuously.
+ *
+ * powerpc need to call sparse_init_one_section right after each
+ * sparse_early_mem_map_alloc, so allocate usemap_map at first.
+ */
+ size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
+ usemap_map = memblock_virt_alloc(size, 0);
+ if (!usemap_map)
+ panic("can not allocate usemap_map\n");
+ alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
+ (void *)usemap_map);
+
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+ map_map = memblock_virt_alloc(size2, 0);
+ if (!map_map)
+ panic("can not allocate map_map\n");
+ alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
+ (void *)map_map);
+#endif
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+
+ usemap = usemap_map[pnum];
+ if (!usemap)
+ continue;
+
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ map = map_map[pnum];
+#else
+ map = sparse_early_mem_map_alloc(pnum);
+#endif
+ if (!map)
+ continue;
+
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ usemap);
+ }
+
+ vmemmap_populate_print_last();
+
+#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
+ memblock_free_early(__pa(map_map), size2);
+#endif
+ memblock_free_early(__pa(usemap_map), size);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+{
+ /* This will make the necessary allocations eventually. */
+ return sparse_mem_map_populate(pnum, nid);
+}
+static void __kfree_section_memmap(struct page *memmap)
+{
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+
+ vmemmap_free(start, end);
+}
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void free_map_bootmem(struct page *memmap)
+{
+ unsigned long start = (unsigned long)memmap;
+ unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+
+ vmemmap_free(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#else
+static struct page *__kmalloc_section_memmap(void)
{
struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * nr_pages;
+ unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
- page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+ page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
goto got_map_page;
@@ -223,60 +634,69 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
- memset(ret, 0, memmap_size);
return ret;
}
-static int vaddr_in_vmalloc_area(void *addr)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
{
- if (addr >= (void *)VMALLOC_START &&
- addr < (void *)VMALLOC_END)
- return 1;
- return 0;
+ return __kmalloc_section_memmap();
}
-static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+static void __kfree_section_memmap(struct page *memmap)
{
- if (vaddr_in_vmalloc_area(memmap))
+ if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * nr_pages));
+ get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
-/*
- * Allocate the accumulated non-linear sections, allocate a mem_map
- * for each and record the physical to section mapping.
- */
-void sparse_init(void)
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void free_map_bootmem(struct page *memmap)
{
- unsigned long pnum;
- struct page *map;
-
- for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- if (!valid_section_nr(pnum))
- continue;
-
- map = sparse_early_mem_map_alloc(pnum);
- if (!map)
- continue;
- sparse_init_one_section(__nr_to_section(pnum), pnum, map);
+ unsigned long maps_section_nr, removing_section_nr, i;
+ unsigned long magic, nr_pages;
+ struct page *page = virt_to_page(memmap);
+
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ for (i = 0; i < nr_pages; i++, page++) {
+ magic = (unsigned long) page->lru.next;
+
+ BUG_ON(magic == NODE_INFO);
+
+ maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
+ removing_section_nr = page->private;
+
+ /*
+ * When this function is called, the section being removed is
+ * in a logically offlined state. This means all pages are isolated
+ * from page allocator. If removing section's memmap is placed
+ * on the same section, it must not be freed.
+ * If it is freed, page allocator may allocate it which will
+ * be removed physically soon.
+ */
+ if (maps_section_nr != removing_section_nr)
+ put_page_bootmem(page);
}
}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
/*
* returns the number of sections whose mem_maps were properly
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
- int nr_pages)
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
struct mem_section *ms;
struct page *memmap;
+ unsigned long *usemap;
unsigned long flags;
int ret;
@@ -284,8 +704,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
* no locking for this, because it does its own
* plus, it does a kmalloc
*/
- sparse_index_init(section_nr, pgdat->node_id);
- memmap = __kmalloc_section_memmap(nr_pages);
+ ret = sparse_index_init(section_nr, pgdat->node_id);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
+ if (!memmap)
+ return -ENOMEM;
+ usemap = __kmalloc_section_usemap();
+ if (!usemap) {
+ __kfree_section_memmap(memmap);
+ return -ENOMEM;
+ }
pgdat_resize_lock(pgdat, &flags);
@@ -294,13 +723,89 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
ret = -EEXIST;
goto out;
}
+
+ memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+
ms->section_mem_map |= SECTION_MARKED_PRESENT;
- ret = sparse_init_one_section(ms, section_nr, memmap);
+ ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
pgdat_resize_unlock(pgdat, &flags);
- if (ret <= 0)
- __kfree_section_memmap(memmap, nr_pages);
+ if (ret <= 0) {
+ kfree(usemap);
+ __kfree_section_memmap(memmap);
+ }
return ret;
}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+ int i;
+
+ if (!memmap)
+ return;
+
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ if (PageHWPoison(&memmap[i])) {
+ atomic_long_sub(1, &num_poisoned_pages);
+ ClearPageHWPoison(&memmap[i]);
+ }
+ }
+}
+#else
+static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
+static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+{
+ struct page *usemap_page;
+
+ if (!usemap)
+ return;
+
+ usemap_page = virt_to_page(usemap);
+ /*
+ * Check to see if allocation came from hot-plug-add
+ */
+ if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
+ kfree(usemap);
+ if (memmap)
+ __kfree_section_memmap(memmap);
+ return;
+ }
+
+ /*
+ * The usemap came from bootmem. This is packed with other usemaps
+ * on the section which has pgdat at boot time. Just keep it as is now.
+ */
+
+ if (memmap)
+ free_map_bootmem(memmap);
+}
+
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+{
+ struct page *memmap = NULL;
+ unsigned long *usemap = NULL, flags;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+
+ pgdat_resize_lock(pgdat, &flags);
+ if (ms->section_mem_map) {
+ usemap = ms->pageblock_flags;
+ memmap = sparse_decode_mem_map(ms->section_mem_map,
+ __section_nr(ms));
+ ms->section_mem_map = 0;
+ ms->pageblock_flags = NULL;
+ }
+ pgdat_resize_unlock(pgdat, &flags);
+
+ clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
+ free_section_usemap(memmap, usemap);
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 2e0e871f542..9e8e3472248 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -5,7 +5,7 @@
*/
/*
- * This file contains the default values for the opereation of the
+ * This file contains the default values for the operation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
* Documentation/sysctl/vm.txt.
* Started 18.12.91
@@ -21,47 +21,243 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm_inline.h>
-#include <linux/buffer_head.h> /* for try_to_release_page() */
-#include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
-#include <linux/init.h>
+#include <linux/backing-dev.h>
+#include <linux/memcontrol.h>
+#include <linux/gfp.h>
+#include <linux/uio.h>
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagemap.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
+static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
+static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
*/
-static void fastcall __page_cache_release(struct page *page)
+static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- unsigned long flags;
struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+ unsigned long flags;
spin_lock_irqsave(&zone->lru_lock, flags);
- VM_BUG_ON(!PageLRU(page));
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
- del_page_from_lru(zone, page);
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
- free_hot_page(page);
+}
+
+static void __put_single_page(struct page *page)
+{
+ __page_cache_release(page);
+ free_hot_cold_page(page, false);
+}
+
+static void __put_compound_page(struct page *page)
+{
+ compound_page_dtor *dtor;
+
+ __page_cache_release(page);
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+}
+
+/**
+ * Two special cases here: we could avoid taking compound_lock_irqsave
+ * and could skip the tail refcounting(in _mapcount).
+ *
+ * 1. Hugetlbfs page:
+ *
+ * PageHeadHuge will remain true until the compound page
+ * is released and enters the buddy allocator, and it could
+ * not be split by __split_huge_page_refcount().
+ *
+ * So if we see PageHeadHuge set, and we have the tail page pin,
+ * then we could safely put head page.
+ *
+ * 2. Slab THP page:
+ *
+ * PG_slab is cleared before the slab frees the head page, and
+ * tail pin cannot be the last reference left on the head page,
+ * because the slab code is free to reuse the compound page
+ * after a kfree/kmem_cache_free without having to check if
+ * there's any tail pin left. In turn all tail pins must be always
+ * released while the head is still pinned by the slab code
+ * and so we know PG_slab will be still set too.
+ *
+ * So if we see PageSlab set, and we have the tail page pin,
+ * then we could safely put head page.
+ */
+static __always_inline
+void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
+{
+ /*
+ * If @page is a THP tail, we must read the tail page
+ * flags after the head page flags. The
+ * __split_huge_page_refcount side enforces write memory barriers
+ * between clearing PageTail and before the head page
+ * can be freed and reallocated.
+ */
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount cannot race
+ * here, see the comment above this function.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+ if (put_page_testzero(page_head)) {
+ /*
+ * If this is the tail of a slab THP page,
+ * the tail pin must not be the last reference
+ * held on the page, because the PG_slab cannot
+ * be cleared before all tail pins (which skips
+ * the _mapcount tail refcounting) have been
+ * released.
+ *
+ * If this is the tail of a hugetlbfs page,
+ * the tail pin may be the last reference on
+ * the page instead, because PageHeadHuge will
+ * not go away until the compound page enters
+ * the buddy allocator.
+ */
+ VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+ __put_compound_page(page_head);
+ }
+ } else
+ /*
+ * __split_huge_page_refcount ran before us,
+ * @page was a THP tail. The split @page_head
+ * has been freed and reallocated as slab or
+ * hugetlbfs page of smaller order (only
+ * possible if reallocated as slab on x86).
+ */
+ if (put_page_testzero(page))
+ __put_single_page(page);
+}
+
+static __always_inline
+void put_refcounted_compound_page(struct page *page_head, struct page *page)
+{
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+
+ /*
+ * @page_head wasn't a dangling pointer but it may not
+ * be a head page anymore by the time we obtain the
+ * lock. That is ok as long as it can't be freed from
+ * under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount ran before us */
+ compound_unlock_irqrestore(page_head, flags);
+ if (put_page_testzero(page_head)) {
+ /*
+ * The @page_head may have been freed
+ * and reallocated as a compound page
+ * of smaller order and then freed
+ * again. All we know is that it
+ * cannot have become: a THP page, a
+ * compound page of higher order, a
+ * tail page. That is because we
+ * still hold the refcount of the
+ * split THP tail and page_head was
+ * the THP head before the split.
+ */
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON_PAGE(page_head != page->first_page, page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON_PAGE(1, page_head);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+ compound_unlock_irqrestore(page_head, flags);
+
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* @page_head is a dangling pointer */
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ goto out_put_single;
+ }
}
static void put_compound_page(struct page *page)
{
- page = (struct page *)page_private(page);
- if (put_page_testzero(page)) {
- void (*dtor)(struct page *page);
+ struct page *page_head;
- dtor = (void (*)(struct page *))page[1].lru.next;
- (*dtor)(page);
+ /*
+ * We see the PageCompound set and PageTail not set, so @page may be:
+ * 1. hugetlbfs head page, or
+ * 2. THP head page.
+ */
+ if (likely(!PageTail(page))) {
+ if (put_page_testzero(page)) {
+ /*
+ * By the time all refcounts have been released
+ * split_huge_page cannot run anymore from under us.
+ */
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+ }
+ return;
}
+
+ /*
+ * We see the PageCompound set and PageTail set, so @page may be:
+ * 1. a tail hugetlbfs page, or
+ * 2. a tail THP page, or
+ * 3. a split THP page.
+ *
+ * Case 3 is possible, as we may race with
+ * __split_huge_page_refcount tearing down a THP page.
+ */
+ page_head = compound_head_by_tail(page);
+ if (!__compound_tail_refcounted(page_head))
+ put_unrefcounted_compound_page(page_head, page);
+ else
+ put_refcounted_compound_page(page_head, page);
}
void put_page(struct page *page)
@@ -69,17 +265,82 @@ void put_page(struct page *page)
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
- __page_cache_release(page);
+ __put_single_page(page);
}
EXPORT_SYMBOL(put_page);
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ */
+bool __get_page_tail(struct page *page)
+{
+ /*
+ * This takes care of get_page() if run on a tail page
+ * returned by one of the get_user_pages/follow_page variants.
+ * get_user_pages/follow_page itself doesn't need the compound
+ * lock because it runs __get_page_tail_foll() under the
+ * proper PT lock that already serializes against
+ * split_huge_page().
+ */
+ unsigned long flags;
+ bool got;
+ struct page *page_head = compound_head(page);
+
+ /* Ref to put_compound_page() comment. */
+ if (!__compound_tail_refcounted(page_head)) {
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ __get_page_tail_foll(page, true);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount ran
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ return false;
+ }
+ }
+
+ got = false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ /* here __split_huge_page_refcount won't run anymore */
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ got = true;
+ }
+ compound_unlock_irqrestore(page_head, flags);
+ if (unlikely(!got))
+ put_page(page_head);
+ }
+ return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
/**
- * put_pages_list(): release a list of pages
+ * put_pages_list() - release a list of pages
+ * @pages: list of pages threaded on page->lru
*
* Release a list of pages which are strung together on page.lru. Currently
* used by read_cache_pages() and related error recovery code.
- *
- * @pages: list of pages threaded on page->lru
*/
void put_pages_list(struct list_head *pages)
{
@@ -94,62 +355,233 @@ void put_pages_list(struct list_head *pages)
EXPORT_SYMBOL(put_pages_list);
/*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim. If it still appears to be reclaimable, move it to the tail of the
- * inactive list. The page still has PageWriteback set, which will pin it.
- *
- * We don't expect many pages to come through here, so don't bother batching
- * things up.
+ * get_kernel_pages() - pin kernel pages in memory
+ * @kiov: An array of struct kvec structures
+ * @nr_segs: number of segments to pin
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_segs long.
*
- * To avoid placing the page at the tail of the LRU while PG_writeback is still
- * set, this function will clear PG_writeback before performing the page
- * motion. Do that inside the lru lock because once PG_writeback is cleared
- * we may not touch the page.
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_segs is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with.
+ */
+int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
+ struct page **pages)
+{
+ int seg;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
+ return seg;
+
+ pages[seg] = kmap_to_page(kiov[seg].iov_base);
+ page_cache_get(pages[seg]);
+ }
+
+ return seg;
+}
+EXPORT_SYMBOL_GPL(get_kernel_pages);
+
+/*
+ * get_kernel_page() - pin a kernel page in memory
+ * @start: starting kernel address
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointer to the page pinned.
+ * Must have room for at least one entry.
*
- * Returns zero if it cleared PG_writeback.
+ * Returns 1 if page is pinned. If the page was not pinned, returns
+ * -errno. The page returned must be released with a put_page() call
+ * when it is finished with.
*/
-int rotate_reclaimable_page(struct page *page)
+int get_kernel_page(unsigned long start, int write, struct page **pages)
{
- struct zone *zone;
- unsigned long flags;
+ const struct kvec kiov = {
+ .iov_base = (void *)start,
+ .iov_len = PAGE_SIZE
+ };
- if (PageLocked(page))
- return 1;
- if (PageDirty(page))
- return 1;
- if (PageActive(page))
- return 1;
- if (!PageLRU(page))
- return 1;
+ return get_kernel_pages(&kiov, 1, write, pages);
+}
+EXPORT_SYMBOL_GPL(get_kernel_page);
- zone = page_zone(page);
- spin_lock_irqsave(&zone->lru_lock, flags);
- if (PageLRU(page) && !PageActive(page)) {
- list_move_tail(&page->lru, &zone->inactive_list);
- __count_vm_event(PGROTATED);
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+ void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
+ void *arg)
+{
+ int i;
+ struct zone *zone = NULL;
+ struct lruvec *lruvec;
+ unsigned long flags = 0;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ struct zone *pagezone = page_zone(page);
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = pagezone;
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ (*move_fn)(page, lruvec, arg);
+ }
+ if (zone)
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ release_pages(pvec->pages, pvec->nr, pvec->cold);
+ pagevec_reinit(pvec);
+}
+
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ int *pgmoved = arg;
+
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ enum lru_list lru = page_lru_base_type(page);
+ list_move_tail(&page->lru, &lruvec->lists[lru]);
+ (*pgmoved)++;
}
- if (!test_clear_page_writeback(page))
- BUG();
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- return 0;
}
/*
- * FIXME: speed this up?
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
*/
-void fastcall activate_page(struct page *page)
+static void pagevec_move_tail(struct pagevec *pvec)
{
- struct zone *zone = page_zone(page);
+ int pgmoved = 0;
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && !PageActive(page)) {
- del_page_from_inactive_list(zone, page);
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+ __count_vm_events(PGROTATED, pgmoved);
+}
+
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim. If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.
+ */
+void rotate_reclaimable_page(struct page *page)
+{
+ if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
+ !PageUnevictable(page) && PageLRU(page)) {
+ struct pagevec *pvec;
+ unsigned long flags;
+
+ page_cache_get(page);
+ local_irq_save(flags);
+ pvec = this_cpu_ptr(&lru_rotate_pvecs);
+ if (!pagevec_add(pvec, page))
+ pagevec_move_tail(pvec);
+ local_irq_restore(flags);
+ }
+}
+
+static void update_page_reclaim_stat(struct lruvec *lruvec,
+ int file, int rotated)
+{
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+
+ reclaim_stat->recent_scanned[file]++;
+ if (rotated)
+ reclaim_stat->recent_rotated[file]++;
+}
+
+static void __activate_page(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru);
SetPageActive(page);
- add_page_to_active_list(zone, page);
+ lru += LRU_ACTIVE;
+ add_page_to_lru_list(page, lruvec, lru);
+ trace_mm_lru_activate(page, page_to_pfn(page));
+
__count_vm_event(PGACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 1);
}
+}
+
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+
+static void activate_page_drain(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+}
+
+static bool need_activate_page_drain(int cpu)
+{
+ return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
+void activate_page(struct page *page)
+{
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ put_cpu_var(activate_page_pvecs);
+ }
+}
+
+#else
+static inline void activate_page_drain(int cpu)
+{
+}
+
+static bool need_activate_page_drain(int cpu)
+{
+ return false;
+}
+
+void activate_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+
+ spin_lock_irq(&zone->lru_lock);
+ __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
spin_unlock_irq(&zone->lru_lock);
}
+#endif
+
+static void __lru_cache_activate_page(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ int i;
+
+ /*
+ * Search backwards on the optimistic assumption that the page being
+ * activated has just been added to this pagevec. Note that only
+ * the local pagevec is examined as a !PageLRU page could be in the
+ * process of being released, reclaimed, migrated or on a remote
+ * pagevec that is currently being drained. Furthermore, marking
+ * a remote pagevec's page PageActive potentially hits a race where
+ * a page is marked PageActive just after it is added to the inactive
+ * list causing accounting errors and BUG_ON checks to trigger.
+ */
+ for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
+ struct page *pagevec_page = pvec->pages[i];
+
+ if (pagevec_page == page) {
+ SetPageActive(page);
+ break;
+ }
+ }
+
+ put_cpu_var(lru_add_pvec);
+}
/*
* Mark a page as having seen activity.
@@ -158,88 +590,275 @@ void fastcall activate_page(struct page *page)
* inactive,referenced -> active,unreferenced
* active,unreferenced -> active,referenced
*/
-void fastcall mark_page_accessed(struct page *page)
+void mark_page_accessed(struct page *page)
{
- if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
- activate_page(page);
+ if (!PageActive(page) && !PageUnevictable(page) &&
+ PageReferenced(page)) {
+
+ /*
+ * If the page is on the LRU, queue it for activation via
+ * activate_page_pvecs. Otherwise, assume the page is on a
+ * pagevec, mark it active and it'll be moved to the active
+ * LRU on the next drain.
+ */
+ if (PageLRU(page))
+ activate_page(page);
+ else
+ __lru_cache_activate_page(page);
ClearPageReferenced(page);
+ if (page_is_file_cache(page))
+ workingset_activation(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
}
-
EXPORT_SYMBOL(mark_page_accessed);
+/*
+ * Used to mark_page_accessed(page) that is not visible yet and when it is
+ * still safe to use non-atomic ops
+ */
+void init_page_accessed(struct page *page)
+{
+ if (!PageReferenced(page))
+ __SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
+{
+ struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+
+ page_cache_get(page);
+ if (!pagevec_space(pvec))
+ __pagevec_lru_add(pvec);
+ pagevec_add(pvec, page);
+ put_cpu_var(lru_add_pvec);
+}
+
/**
* lru_cache_add: add a page to the page lists
* @page: the page to add
*/
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+void lru_cache_add_anon(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
-void fastcall lru_cache_add(struct page *page)
+void lru_cache_add_file(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
- page_cache_get(page);
- if (!pagevec_add(pvec, page))
- __pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvecs);
+/**
+ * lru_cache_add - add a page to a page list
+ * @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives the caller of lru_cache_add() a chance to
+ * have the page added to the active list using mark_page_accessed().
+ */
+void lru_cache_add(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ __lru_cache_add(page);
}
-void fastcall lru_cache_add_active(struct page *page)
+/**
+ * add_page_to_unevictable_list - add a page to the unevictable list
+ * @page: the page to be added to the unevictable list
+ *
+ * Add page directly to its zone's unevictable list. To avoid races with
+ * tasks that might be making the page evictable, through eg. munlock,
+ * munmap or exit, while it's not on the lru, we want to add the page
+ * while it's locked or otherwise "invisible" to other tasks. This is
+ * difficult to do when using the pagevec cache, so bypass that.
+ */
+void add_page_to_unevictable_list(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+ struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
- page_cache_get(page);
- if (!pagevec_add(pvec, page))
- __pagevec_lru_add_active(pvec);
- put_cpu_var(lru_add_active_pvecs);
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ spin_unlock_irq(&zone->lru_lock);
}
-static void __lru_add_drain(int cpu)
+/*
+ * If the page can not be invalidated, it is moved to the
+ * inactive list to speed up its reclaim. It is moved to the
+ * head of the list, rather than the tail, to give the flusher
+ * threads some time to write it out, as this is much more
+ * effective than the single-page writeout from reclaim.
+ *
+ * If the page is not mapped and is dirty or under writeback, it
+ * can be reclaimed as soon as possible using PG_reclaim.
+ *
+ * 1. active, mapped page -> none
+ * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
+ * 3. inactive, mapped page -> none
+ * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 5. inactive, clean -> inactive, tail
+ * 6. Others -> none
+ *
+ * In case 4, the page is moved to the head of the inactive list because
+ * the VM expects it to be written out by the flusher threads, which is
+ * much more effective than the single-page writeout from reclaim.
+ */
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
{
- struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+ int lru, file;
+ bool active;
+
+ if (!PageLRU(page))
+ return;
+
+ if (PageUnevictable(page))
+ return;
+
+ /* Some processes are using the page */
+ if (page_mapped(page))
+ return;
+
+ active = PageActive(page);
+ file = page_is_file_cache(page);
+ lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ if (PageWriteback(page) || PageDirty(page)) {
+ /*
+ * Setting PG_reclaim can race with end_page_writeback(),
+ * which can confuse readahead. But the race window
+ * is _really_ small and it is a non-critical problem.
+ */
+ SetPageReclaim(page);
+ } else {
+ /*
+ * The page's writeback ended while it was in the pagevec.
+ * Move the page to the tail of the inactive list.
+ */
+ list_move_tail(&page->lru, &lruvec->lists[lru]);
+ __count_vm_event(PGROTATED);
+ }
+
+ if (active)
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+}
+
+/*
+ * Drain pages out of the cpu's pagevecs.
+ * Either "cpu" is the current CPU, and preemption has already been
+ * disabled; or "cpu" is being hot-unplugged, and is already dead.
+ */
+void lru_add_drain_cpu(int cpu)
+{
+ struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
- /* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_add_active_pvecs, cpu);
+
+ pvec = &per_cpu(lru_rotate_pvecs, cpu);
+ if (pagevec_count(pvec)) {
+ unsigned long flags;
+
+ /* No harm done if a racing interrupt already did this */
+ local_irq_save(flags);
+ pagevec_move_tail(pvec);
+ local_irq_restore(flags);
+ }
+
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
+ activate_page_drain(cpu);
+}
+
+/**
+ * deactivate_page - forcefully deactivate a page
+ * @page: page to deactivate
+ *
+ * This function hints the VM that @page is a good reclaim candidate,
+ * for example if its invalidation fails due to the page being dirty
+ * or under writeback.
+ */
+void deactivate_page(struct page *page)
+{
+ /*
+ * In a workload with many unevictable pages such as mprotect, unevictable
+ * page deactivation for accelerating reclaim is pointless.
+ */
+ if (PageUnevictable(page))
+ return;
+
+ if (likely(get_page_unless_zero(page))) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
}
void lru_add_drain(void)
{
- __lru_add_drain(get_cpu());
+ lru_add_drain_cpu(get_cpu());
put_cpu();
}
-#ifdef CONFIG_NUMA
-static void lru_add_drain_per_cpu(void *dummy)
+static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+void lru_add_drain_all(void)
{
- return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-}
+ static DEFINE_MUTEX(lock);
+ static struct cpumask has_work;
+ int cpu;
+
+ mutex_lock(&lock);
+ get_online_cpus();
+ cpumask_clear(&has_work);
+
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+ pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ need_activate_page_drain(cpu)) {
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ schedule_work_on(cpu, work);
+ cpumask_set_cpu(cpu, &has_work);
+ }
+ }
-#else
+ for_each_cpu(cpu, &has_work)
+ flush_work(&per_cpu(lru_add_drain_work, cpu));
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
-{
- lru_add_drain();
- return 0;
+ put_online_cpus();
+ mutex_unlock(&lock);
}
-#endif
/*
* Batched page_cache_release(). Decrement the reference count on all the
@@ -249,23 +868,25 @@ int lru_add_drain_all(void)
* Avoid taking zone->lru_lock if possible, but if it is taken, retain it
* for the remainder of the operation.
*
- * The locking in this function is against shrink_cache(): we recheck the
- * page count inside the lock to see whether shrink_cache grabbed the page
- * via the LRU. If it did, give up: shrink_cache will free it.
+ * The locking in this function is against shrink_inactive_list(): we recheck
+ * the page count inside the lock to see whether shrink_inactive_list()
+ * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
+ * will free it.
*/
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
{
int i;
- struct pagevec pages_to_free;
+ LIST_HEAD(pages_to_free);
struct zone *zone = NULL;
+ struct lruvec *lruvec;
+ unsigned long uninitialized_var(flags);
- pagevec_init(&pages_to_free, cold);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
if (unlikely(PageCompound(page))) {
if (zone) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
zone = NULL;
}
put_compound_page(page);
@@ -277,31 +898,32 @@ void release_pages(struct page **pages, int nr, int cold)
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
+
if (pagezone != zone) {
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock,
+ flags);
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irqsave(&zone->lru_lock, flags);
}
- VM_BUG_ON(!PageLRU(page));
+
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
- del_page_from_lru(zone, page);
+ del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
- if (!pagevec_add(&pages_to_free, page)) {
- if (zone) {
- spin_unlock_irq(&zone->lru_lock);
- zone = NULL;
- }
- __pagevec_free(&pages_to_free);
- pagevec_reinit(&pages_to_free);
- }
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+
+ list_add(&page->lru, &pages_to_free);
}
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
- pagevec_free(&pages_to_free);
+ free_hot_cold_page_list(&pages_to_free, cold);
}
+EXPORT_SYMBOL(release_pages);
/*
* The pages which we're about to release may be in the deferred lru-addition
@@ -319,29 +941,62 @@ void __pagevec_release(struct pagevec *pvec)
release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
pagevec_reinit(pvec);
}
-
EXPORT_SYMBOL(__pagevec_release);
-/*
- * pagevec_release() for pages which are known to not be on the LRU
- *
- * This function reinitialises the caller's pagevec.
- */
-void __pagevec_release_nonlru(struct pagevec *pvec)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct page *page, struct page *page_tail,
+ struct lruvec *lruvec, struct list_head *list)
{
- int i;
- struct pagevec pages_to_free;
+ const int file = 0;
+
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
+ VM_BUG_ON(NR_CPUS != 1 &&
+ !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
+
+ if (!list)
+ SetPageLRU(page_tail);
+
+ if (likely(PageLRU(page)))
+ list_add_tail(&page_tail->lru, &page->lru);
+ else if (list) {
+ /* page reclaim is reclaiming a huge page */
+ get_page(page_tail);
+ list_add_tail(&page_tail->lru, list);
+ } else {
+ struct list_head *list_head;
+ /*
+ * Head page has not yet been counted, as an hpage,
+ * so we must account for each subpage individually.
+ *
+ * Use the standard add function to put page_tail on the list,
+ * but then correct its position so they all end up in order.
+ */
+ add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
+ list_head = page_tail->lru.prev;
+ list_move_tail(&page_tail->lru, list_head);
+ }
- pagevec_init(&pages_to_free, pvec->cold);
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
+ if (!PageUnevictable(page))
+ update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- VM_BUG_ON(PageLRU(page));
- if (put_page_testzero(page))
- pagevec_add(&pages_to_free, page);
- }
- pagevec_free(&pages_to_free);
- pagevec_reinit(pvec);
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ int file = page_is_file_cache(page);
+ int active = PageActive(page);
+ enum lru_list lru = page_lru(page);
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec, lru);
+ update_page_reclaim_stat(lruvec, file, active);
+ trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
}
/*
@@ -350,74 +1005,59 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
*/
void __pagevec_lru_add(struct pagevec *pvec)
{
- int i;
- struct zone *zone = NULL;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- add_page_to_inactive_list(zone, page);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
+ pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
-
EXPORT_SYMBOL(__pagevec_lru_add);
-void __pagevec_lru_add_active(struct pagevec *pvec)
+/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_pages: The maximum number of entries
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_pages pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages,
+ pgoff_t *indices)
{
- int i;
- struct zone *zone = NULL;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- VM_BUG_ON(PageActive(page));
- SetPageActive(page);
- add_page_to_active_list(zone, page);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
+ pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
}
-/*
- * Try to drop buffers from the pages in a pagevec
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
*/
-void pagevec_strip(struct pagevec *pvec)
+void pagevec_remove_exceptionals(struct pagevec *pvec)
{
- int i;
+ int i, j;
- for (i = 0; i < pagevec_count(pvec); i++) {
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
-
- if (PagePrivate(page) && !TestSetPageLocked(page)) {
- if (PagePrivate(page))
- try_to_release_page(page, 0);
- unlock_page(page);
- }
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
}
+ pvec->nr = j;
}
/**
@@ -442,7 +1082,6 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
-
EXPORT_SYMBOL(pagevec_lookup);
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
@@ -452,58 +1091,24 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
nr_pages, pvec->pages);
return pagevec_count(pvec);
}
-
EXPORT_SYMBOL(pagevec_lookup_tag);
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
-
-static DEFINE_PER_CPU(long, committed_space) = 0;
-
-void vm_acct_memory(long pages)
-{
- long *local;
-
- preempt_disable();
- local = &__get_cpu_var(committed_space);
- *local += pages;
- if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
- atomic_add(*local, &vm_committed_space);
- *local = 0;
- }
- preempt_enable();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- long *committed;
-
- committed = &per_cpu(committed_space, (long)hcpu);
- if (action == CPU_DEAD) {
- atomic_add(*committed, &vm_committed_space);
- *committed = 0;
- __lru_add_drain((long)hcpu);
- }
- return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
-
/*
* Perform any setup for the swap system
*/
void __init swap_setup(void)
{
- unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
+ unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
+#ifdef CONFIG_SWAP
+ int i;
+
+ if (bdi_init(swapper_spaces[0].backing_dev_info))
+ panic("Failed to init swap bdi");
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ spin_lock_init(&swapper_spaces[i].tree_lock);
+ INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
+ }
+#endif
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
@@ -514,5 +1119,4 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
- hotcpu_notifier(cpu_swap_callback, 0);
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f7cf2a4cb5..2972eee184a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,42 +6,42 @@
*
* Rewritten to use page cache, (C) 1998 Stephen Tweedie
*/
-#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
+#include <linux/page_cgroup.h>
#include <asm/pgtable.h>
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_list, to make sync_page look nicer, and to allow
- * future use of radix_tree tags in the swap cache.
+ * vmscan's shrink_page_list.
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .sync_page = block_sync_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .set_page_dirty = swap_set_page_dirty,
.migratepage = migrate_page,
};
static struct backing_dev_info swap_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
- .unplug_io_fn = swap_unplug_io_fn,
+ .name = "swap",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};
-struct address_space swapper_space = {
- .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
- .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
- .a_ops = &swap_aops,
- .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
- .backing_dev_info = &swap_backing_dev_info,
+struct address_space swapper_spaces[MAX_SWAPFILES] = {
+ [0 ... MAX_SWAPFILES - 1] = {
+ .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+ .a_ops = &swap_aops,
+ .backing_dev_info = &swap_backing_dev_info,
+ }
};
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -51,70 +51,85 @@ static struct {
unsigned long del_total;
unsigned long find_success;
unsigned long find_total;
- unsigned long noent_race;
- unsigned long exist_race;
} swap_cache_info;
+unsigned long total_swapcache_pages(void)
+{
+ int i;
+ unsigned long ret = 0;
+
+ for (i = 0; i < MAX_SWAPFILES; i++)
+ ret += swapper_spaces[i].nrpages;
+ return ret;
+}
+
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
void show_swap_cache_info(void)
{
- printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
+ printk("%lu pages in swap cache\n", total_swapcache_pages());
+ printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
swap_cache_info.add_total, swap_cache_info.del_total,
- swap_cache_info.find_success, swap_cache_info.find_total,
- swap_cache_info.noent_race, swap_cache_info.exist_race);
- printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+ swap_cache_info.find_success, swap_cache_info.find_total);
+ printk("Free swap = %ldkB\n",
+ get_nr_swap_pages() << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
/*
- * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
- gfp_t gfp_mask)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
+ struct address_space *address_space;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+ page_cache_get(page);
+ SetPageSwapCache(page);
+ set_page_private(page, entry.val);
+
+ address_space = swap_address_space(entry);
+ spin_lock_irq(&address_space->tree_lock);
+ error = radix_tree_insert(&address_space->page_tree,
+ entry.val, page);
+ if (likely(!error)) {
+ address_space->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ INC_CACHE_INFO(add_total);
+ }
+ spin_unlock_irq(&address_space->tree_lock);
- BUG_ON(PageSwapCache(page));
- BUG_ON(PagePrivate(page));
- error = radix_tree_preload(gfp_mask);
- if (!error) {
- write_lock_irq(&swapper_space.tree_lock);
- error = radix_tree_insert(&swapper_space.page_tree,
- entry.val, page);
- if (!error) {
- page_cache_get(page);
- SetPageLocked(page);
- SetPageSwapCache(page);
- set_page_private(page, entry.val);
- total_swapcache_pages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- }
- write_unlock_irq(&swapper_space.tree_lock);
- radix_tree_preload_end();
+ if (unlikely(error)) {
+ /*
+ * Only the context which has set the SWAP_HAS_CACHE flag
+ * would call add_to_swap_cache().
+ * So add_to_swap_cache() doesn't return -EEXIST.
+ */
+ VM_BUG_ON(error == -EEXIST);
+ set_page_private(page, 0UL);
+ ClearPageSwapCache(page);
+ page_cache_release(page);
}
+
return error;
}
-static int add_to_swap_cache(struct page *page, swp_entry_t entry)
+
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
- if (!swap_duplicate(entry)) {
- INC_CACHE_INFO(noent_race);
- return -ENOENT;
- }
- error = __add_to_swap_cache(page, entry, GFP_KERNEL);
- /*
- * Anon pages are already on the LRU, we don't run lru_cache_add here.
- */
- if (error) {
- swap_free(entry);
- if (error == -EEXIST)
- INC_CACHE_INFO(exist_race);
- return error;
+ error = radix_tree_maybe_preload(gfp_mask);
+ if (!error) {
+ error = __add_to_swap_cache(page, entry);
+ radix_tree_preload_end();
}
- INC_CACHE_INFO(add_total);
- return 0;
+ return error;
}
/*
@@ -123,15 +138,19 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry)
*/
void __delete_from_swap_cache(struct page *page)
{
- BUG_ON(!PageLocked(page));
- BUG_ON(!PageSwapCache(page));
- BUG_ON(PageWriteback(page));
- BUG_ON(PagePrivate(page));
+ swp_entry_t entry;
+ struct address_space *address_space;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
- radix_tree_delete(&swapper_space.page_tree, page_private(page));
+ entry.val = page_private(page);
+ address_space = swap_address_space(entry);
+ radix_tree_delete(&address_space->page_tree, page_private(page));
set_page_private(page, 0);
ClearPageSwapCache(page);
- total_swapcache_pages--;
+ address_space->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
}
@@ -143,48 +162,48 @@ void __delete_from_swap_cache(struct page *page)
* Allocate swap space for the page and add the page to the
* swap cache. Caller needs to hold the page lock.
*/
-int add_to_swap(struct page * page, gfp_t gfp_mask)
+int add_to_swap(struct page *page, struct list_head *list)
{
swp_entry_t entry;
int err;
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ entry = get_swap_page();
+ if (!entry.val)
+ return 0;
- for (;;) {
- entry = get_swap_page();
- if (!entry.val)
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page_to_list(page, list))) {
+ swapcache_free(entry, NULL);
return 0;
+ }
+ /*
+ * Radix-tree node allocations from PF_MEMALLOC contexts could
+ * completely exhaust the page allocator. __GFP_NOMEMALLOC
+ * stops emergency reserves from being allocated.
+ *
+ * TODO: this could cause a theoretical memory reclaim
+ * deadlock in the swap out path.
+ */
+ /*
+ * Add it to the swap cache and mark it dirty
+ */
+ err = add_to_swap_cache(page, entry,
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+
+ if (!err) { /* Success */
+ SetPageDirty(page);
+ return 1;
+ } else { /* -ENOMEM radix-tree allocation failure */
/*
- * Radix-tree node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache and mark it dirty
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
*/
- err = __add_to_swap_cache(page, entry,
- gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
-
- switch (err) {
- case 0: /* Success */
- SetPageUptodate(page);
- SetPageDirty(page);
- INC_CACHE_INFO(add_total);
- return 1;
- case -EEXIST:
- /* Raced with "speculative" read_swap_cache_async */
- INC_CACHE_INFO(exist_race);
- swap_free(entry);
- continue;
- default:
- /* -ENOMEM radix-tree allocation failure */
- swap_free(entry);
- return 0;
- }
+ swapcache_free(entry, NULL);
+ return 0;
}
}
@@ -197,63 +216,31 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
+ struct address_space *address_space;
entry.val = page_private(page);
- write_lock_irq(&swapper_space.tree_lock);
+ address_space = swap_address_space(entry);
+ spin_lock_irq(&address_space->tree_lock);
__delete_from_swap_cache(page);
- write_unlock_irq(&swapper_space.tree_lock);
+ spin_unlock_irq(&address_space->tree_lock);
- swap_free(entry);
+ swapcache_free(entry, page);
page_cache_release(page);
}
-/*
- * Strange swizzling function only for use by shmem_writepage
- */
-int move_to_swap_cache(struct page *page, swp_entry_t entry)
-{
- int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
- if (!err) {
- remove_from_page_cache(page);
- page_cache_release(page); /* pagecache ref */
- if (!swap_duplicate(entry))
- BUG();
- SetPageDirty(page);
- INC_CACHE_INFO(add_total);
- } else if (err == -EEXIST)
- INC_CACHE_INFO(exist_race);
- return err;
-}
-
-/*
- * Strange swizzling function for shmem_getpage (and shmem_unuse)
- */
-int move_from_swap_cache(struct page *page, unsigned long index,
- struct address_space *mapping)
-{
- int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
- if (!err) {
- delete_from_swap_cache(page);
- /* shift page from clean_pages to dirty_pages list */
- ClearPageDirty(page);
- set_page_dirty(page);
- }
- return err;
-}
-
/*
* If we are the only user, then try to free up the swap cache.
*
* Its ok to check for PageSwapCache without the page lock
- * here because we are going to recheck again inside
- * exclusive_swap_page() _with_ the lock.
+ * here because we are going to recheck again inside
+ * try_to_free_swap() _with_ the lock.
* - Marcelo
*/
static inline void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !TestSetPageLocked(page)) {
- remove_exclusive_swap_page(page);
+ if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
+ try_to_free_swap(page);
unlock_page(page);
}
}
@@ -283,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, todo, 0);
+ release_pages(pagep, todo, false);
pagep += todo;
nr -= todo;
}
@@ -299,10 +286,13 @@ struct page * lookup_swap_cache(swp_entry_t entry)
{
struct page *page;
- page = find_get_page(&swapper_space, entry.val);
+ page = find_get_page(swap_address_space(entry), entry.val);
- if (page)
+ if (page) {
INC_CACHE_INFO(find_success);
+ if (TestClearPageReadahead(page))
+ atomic_inc(&swapin_readahead_hits);
+ }
INC_CACHE_INFO(find_total);
return page;
@@ -314,7 +304,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
*/
-struct page *read_swap_cache_async(swp_entry_t entry,
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *found_page, *new_page = NULL;
@@ -326,7 +316,8 @@ struct page *read_swap_cache_async(swp_entry_t entry,
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(&swapper_space, entry.val);
+ found_page = find_get_page(swap_address_space(entry),
+ entry.val);
if (found_page)
break;
@@ -334,33 +325,172 @@ struct page *read_swap_cache_async(swp_entry_t entry,
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ new_page = alloc_page_vma(gfp_mask, vma, addr);
if (!new_page)
break; /* Out of memory */
}
/*
- * Associate the page with swap entry in the swap cache.
- * May fail (-ENOENT) if swap entry has been freed since
- * our caller observed it. May fail (-EEXIST) if there
- * is already a page associated with this entry in the
- * swap cache: added by a racing read_swap_cache_async,
- * or by try_to_swap_out (or shmem_writepage) re-using
- * the just freed swap entry for an existing page.
- * May fail (-ENOMEM) if radix-tree node allocation failed.
+ * call radix_tree_preload() while we can wait.
+ */
+ err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
+ if (err)
+ break;
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
*/
- err = add_to_swap_cache(new_page, entry);
- if (!err) {
+ err = swapcache_prepare(entry);
+ if (err == -EEXIST) {
+ radix_tree_preload_end();
+ /*
+ * We might race against get_swap_page() and stumble
+ * across a SWAP_HAS_CACHE swap_map entry whose page
+ * has not been brought into the swapcache yet, while
+ * the other end is scheduled away waiting on discard
+ * I/O completion at scan_swap_map().
+ *
+ * In order to avoid turning this transitory state
+ * into a permanent loop around this -EEXIST case
+ * if !CONFIG_PREEMPT and the I/O completion happens
+ * to be waiting on the CPU waitqueue where we are now
+ * busy looping, we just conditionally invoke the
+ * scheduler here, if there are some more important
+ * tasks to run.
+ */
+ cond_resched();
+ continue;
+ }
+ if (err) { /* swp entry is obsolete ? */
+ radix_tree_preload_end();
+ break;
+ }
+
+ /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+ err = __add_to_swap_cache(new_page, entry);
+ if (likely(!err)) {
+ radix_tree_preload_end();
/*
* Initiate read into locked page and return.
*/
- lru_cache_add_active(new_page);
- swap_readpage(NULL, new_page);
+ lru_cache_add_anon(new_page);
+ swap_readpage(new_page);
return new_page;
}
- } while (err != -ENOENT && err != -ENOMEM);
+ radix_tree_preload_end();
+ ClearPageSwapBacked(new_page);
+ __clear_page_locked(new_page);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
+ swapcache_free(entry, NULL);
+ } while (err != -ENOMEM);
if (new_page)
page_cache_release(new_page);
return found_page;
}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int pages, max_pages, last_ra;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << ACCESS_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ prev_offset = offset;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = atomic_read(&last_readahead_pages) / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
+/**
+ * swapin_readahead - swap in pages in hope we need them soon
+ * @entry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vma: user vma this address belongs to
+ * @addr: target address for mempolicy
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (1 << page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ *
+ * This has been extended to use the NUMA policies from the mm triggering
+ * the readahead.
+ *
+ * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ */
+struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct page *page;
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
+ unsigned long start_offset, end_offset;
+ unsigned long mask;
+ struct blk_plug plug;
+
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
+ /* Read a page_cluster sized and aligned cluster around offset. */
+ start_offset = offset & ~mask;
+ end_offset = offset | mask;
+ if (!start_offset) /* First page is swap header. */
+ start_offset++;
+
+ blk_start_plug(&plug);
+ for (offset = start_offset; offset <= end_offset ; offset++) {
+ /* Ok, do the async read-ahead now */
+ page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr);
+ if (!page)
+ continue;
+ if (offset != entry_offset)
+ SetPageReadahead(page);
+ page_cache_release(page);
+ }
+ blk_finish_plug(&plug);
+
+ lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
+ return read_swap_cache_async(entry, gfp_mask, vma, addr);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1f5ec78378..4c524f7bd0b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,82 +14,485 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
-#include <linux/shm.h>
+#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
+#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
-#include <linux/module.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
+#include <linux/memcontrol.h>
+#include <linux/poll.h>
+#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+#include <linux/export.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
+#include <linux/page_cgroup.h>
+
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+ unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
-unsigned int nr_swapfiles;
+static unsigned int nr_swapfiles;
+atomic_long_t nr_swap_pages;
+/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int swap_overflow;
+static int least_priority;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+PLIST_HEAD(swap_active_head);
-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
+
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
+/* Activity counter to indicate that a swapon or swapoff has occurred */
+static atomic_t proc_poll_event = ATOMIC_INIT(0);
+
+static inline unsigned char swap_count(unsigned char ent)
+{
+ return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
+}
+
+/* returns 1 if swap entry is freed */
+static int
+__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+{
+ swp_entry_t entry = swp_entry(si->type, offset);
+ struct page *page;
+ int ret = 0;
+
+ page = find_get_page(swap_address_space(entry), entry.val);
+ if (!page)
+ return 0;
+ /*
+ * This function is called from scan_swap_map() and it's called
+ * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
+ * We have to use trylock for avoiding deadlock. This is a special
+ * case and you should use try_to_free_swap() with explicit lock_page()
+ * in usual operations.
+ */
+ if (trylock_page(page)) {
+ ret = try_to_free_swap(page);
+ unlock_page(page);
+ }
+ page_cache_release(page);
+ return ret;
+}
+
/*
- * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a mutex.
+ * swapon tell device that all the old swap contents can be discarded,
+ * to allow the swap device to optimize its wear-levelling.
*/
-static DECLARE_RWSEM(swap_unplug_sem);
+static int discard_swap(struct swap_info_struct *si)
+{
+ struct swap_extent *se;
+ sector_t start_block;
+ sector_t nr_blocks;
+ int err = 0;
+
+ /* Do not discard the swap header page! */
+ se = &si->first_swap_extent;
+ start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+ nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+ if (nr_blocks) {
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, 0);
+ if (err)
+ return err;
+ cond_resched();
+ }
+
+ list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ start_block = se->start_block << (PAGE_SHIFT - 9);
+ nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+
+ err = blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_KERNEL, 0);
+ if (err)
+ break;
+
+ cond_resched();
+ }
+ return err; /* That will often be -EOPNOTSUPP */
+}
-void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
+/*
+ * swap allocation tell device that a cluster of swap can now be discarded,
+ * to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+ pgoff_t start_page, pgoff_t nr_pages)
{
- swp_entry_t entry;
+ struct swap_extent *se = si->curr_swap_extent;
+ int found_extent = 0;
- down_read(&swap_unplug_sem);
- entry.val = page_private(page);
- if (PageSwapCache(page)) {
- struct block_device *bdev = swap_info[swp_type(entry)].bdev;
- struct backing_dev_info *bdi;
+ while (nr_pages) {
+ struct list_head *lh;
- /*
- * If the page is removed from swapcache from under us (with a
- * racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page_private(page) above.
- * If the WARN_ON triggers during a swapoff it maybe the race
- * condition and it's harmless. However if it triggers without
- * swapoff it signals a problem.
- */
- WARN_ON(page_count(page) <= 1);
+ if (se->start_page <= start_page &&
+ start_page < se->start_page + se->nr_pages) {
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
+
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
+
+ if (!found_extent++)
+ si->curr_swap_extent = se;
+
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO, 0))
+ break;
+ }
- bdi = bdev->bd_inode->i_mapping->backing_dev_info;
- blk_run_backing_dev(bdi, page);
+ lh = se->list.next;
+ se = list_entry(lh, struct swap_extent, list);
}
- up_read(&swap_unplug_sem);
}
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
-static inline unsigned long scan_swap_map(struct swap_info_struct *si)
+static inline void cluster_set_flag(struct swap_cluster_info *info,
+ unsigned int flag)
{
- unsigned long offset, last_in_cluster;
+ info->flags = flag;
+}
+
+static inline unsigned int cluster_count(struct swap_cluster_info *info)
+{
+ return info->data;
+}
+
+static inline void cluster_set_count(struct swap_cluster_info *info,
+ unsigned int c)
+{
+ info->data = c;
+}
+
+static inline void cluster_set_count_flag(struct swap_cluster_info *info,
+ unsigned int c, unsigned int f)
+{
+ info->flags = f;
+ info->data = c;
+}
+
+static inline unsigned int cluster_next(struct swap_cluster_info *info)
+{
+ return info->data;
+}
+
+static inline void cluster_set_next(struct swap_cluster_info *info,
+ unsigned int n)
+{
+ info->data = n;
+}
+
+static inline void cluster_set_next_flag(struct swap_cluster_info *info,
+ unsigned int n, unsigned int f)
+{
+ info->flags = f;
+ info->data = n;
+}
+
+static inline bool cluster_is_free(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_FREE;
+}
+
+static inline bool cluster_is_null(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_NEXT_NULL;
+}
+
+static inline void cluster_set_null(struct swap_cluster_info *info)
+{
+ info->flags = CLUSTER_FLAG_NEXT_NULL;
+ info->data = 0;
+}
+
+/* Add a cluster to discard list and schedule it to do discard */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+ unsigned int idx)
+{
+ /*
+ * If scan_swap_map() can't find a free cluster, it will check
+ * si->swap_map directly. To make sure the discarding cluster isn't
+ * taken by scan_swap_map(), mark the swap entries bad (occupied). It
+ * will be cleared after discard
+ */
+ memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+ SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+ if (cluster_is_null(&si->discard_cluster_head)) {
+ cluster_set_next_flag(&si->discard_cluster_head,
+ idx, 0);
+ cluster_set_next_flag(&si->discard_cluster_tail,
+ idx, 0);
+ } else {
+ unsigned int tail = cluster_next(&si->discard_cluster_tail);
+ cluster_set_next(&si->cluster_info[tail], idx);
+ cluster_set_next_flag(&si->discard_cluster_tail,
+ idx, 0);
+ }
+
+ schedule_work(&si->discard_work);
+}
+
+/*
+ * Doing discard actually. After a cluster discard is finished, the cluster
+ * will be added to free cluster list. caller should hold si->lock.
+*/
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+ struct swap_cluster_info *info;
+ unsigned int idx;
+
+ info = si->cluster_info;
+
+ while (!cluster_is_null(&si->discard_cluster_head)) {
+ idx = cluster_next(&si->discard_cluster_head);
+
+ cluster_set_next_flag(&si->discard_cluster_head,
+ cluster_next(&info[idx]), 0);
+ if (cluster_next(&si->discard_cluster_tail) == idx) {
+ cluster_set_null(&si->discard_cluster_head);
+ cluster_set_null(&si->discard_cluster_tail);
+ }
+ spin_unlock(&si->lock);
+
+ discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+ SWAPFILE_CLUSTER);
+
+ spin_lock(&si->lock);
+ cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+ if (cluster_is_null(&si->free_cluster_head)) {
+ cluster_set_next_flag(&si->free_cluster_head,
+ idx, 0);
+ cluster_set_next_flag(&si->free_cluster_tail,
+ idx, 0);
+ } else {
+ unsigned int tail;
+
+ tail = cluster_next(&si->free_cluster_tail);
+ cluster_set_next(&info[tail], idx);
+ cluster_set_next_flag(&si->free_cluster_tail,
+ idx, 0);
+ }
+ memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+ 0, SWAPFILE_CLUSTER);
+ }
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+ struct swap_info_struct *si;
+
+ si = container_of(work, struct swap_info_struct, discard_work);
+
+ spin_lock(&si->lock);
+ swap_do_scheduled_discard(si);
+ spin_unlock(&si->lock);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+ * removed from free cluster list and its usage counter will be increased.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+ if (!cluster_info)
+ return;
+ if (cluster_is_free(&cluster_info[idx])) {
+ VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
+ cluster_set_next_flag(&p->free_cluster_head,
+ cluster_next(&cluster_info[idx]), 0);
+ if (cluster_next(&p->free_cluster_tail) == idx) {
+ cluster_set_null(&p->free_cluster_tail);
+ cluster_set_null(&p->free_cluster_head);
+ }
+ cluster_set_count_flag(&cluster_info[idx], 0, 0);
+ }
+
+ VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+ cluster_set_count(&cluster_info[idx],
+ cluster_count(&cluster_info[idx]) + 1);
+}
+
+/*
+ * The cluster corresponding to page_nr decreases one usage. If the usage
+ * counter becomes 0, which means no page in the cluster is in using, we can
+ * optionally discard the cluster and add it to free cluster list.
+ */
+static void dec_cluster_info_page(struct swap_info_struct *p,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+ unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+ if (!cluster_info)
+ return;
+
+ VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
+ cluster_set_count(&cluster_info[idx],
+ cluster_count(&cluster_info[idx]) - 1);
+
+ if (cluster_count(&cluster_info[idx]) == 0) {
+ /*
+ * If the swap is discardable, prepare discard the cluster
+ * instead of free it immediately. The cluster will be freed
+ * after discard.
+ */
+ if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+ (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+ swap_cluster_schedule_discard(p, idx);
+ return;
+ }
+
+ cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+ if (cluster_is_null(&p->free_cluster_head)) {
+ cluster_set_next_flag(&p->free_cluster_head, idx, 0);
+ cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+ } else {
+ unsigned int tail = cluster_next(&p->free_cluster_tail);
+ cluster_set_next(&cluster_info[tail], idx);
+ cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+ }
+ }
+}
+
+/*
+ * It's possible scan_swap_map() uses a free cluster in the middle of free
+ * cluster list. Avoiding such abuse to avoid list corruption.
+ */
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ struct percpu_cluster *percpu_cluster;
+ bool conflict;
+
+ offset /= SWAPFILE_CLUSTER;
+ conflict = !cluster_is_null(&si->free_cluster_head) &&
+ offset != cluster_next(&si->free_cluster_head) &&
+ cluster_is_free(&si->cluster_info[offset]);
+
+ if (!conflict)
+ return false;
+
+ percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+ cluster_set_null(&percpu_cluster->index);
+ return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+ unsigned long *offset, unsigned long *scan_base)
+{
+ struct percpu_cluster *cluster;
+ bool found_free;
+ unsigned long tmp;
+
+new_cluster:
+ cluster = this_cpu_ptr(si->percpu_cluster);
+ if (cluster_is_null(&cluster->index)) {
+ if (!cluster_is_null(&si->free_cluster_head)) {
+ cluster->index = si->free_cluster_head;
+ cluster->next = cluster_next(&cluster->index) *
+ SWAPFILE_CLUSTER;
+ } else if (!cluster_is_null(&si->discard_cluster_head)) {
+ /*
+ * we don't have free cluster but have some clusters in
+ * discarding, do discard now and reclaim them
+ */
+ swap_do_scheduled_discard(si);
+ *scan_base = *offset = si->cluster_next;
+ goto new_cluster;
+ } else
+ return;
+ }
+
+ found_free = false;
+
+ /*
+ * Other CPUs can use our cluster if they can't find a free cluster,
+ * check if there is still free entry in the cluster
+ */
+ tmp = cluster->next;
+ while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+ SWAPFILE_CLUSTER) {
+ if (!si->swap_map[tmp]) {
+ found_free = true;
+ break;
+ }
+ tmp++;
+ }
+ if (!found_free) {
+ cluster_set_null(&cluster->index);
+ goto new_cluster;
+ }
+ cluster->next = tmp + 1;
+ *offset = tmp;
+ *scan_base = tmp;
+}
+
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
+{
+ unsigned long offset;
+ unsigned long scan_base;
+ unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
- /*
+ /*
* We try to cluster swap pages by allocating them sequentially
* in swap. Once we've allocated SWAPFILE_CLUSTER pages this
* way, however, we resort to first-free allocation, starting
@@ -97,16 +500,33 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
* all over the entire swap partition, so that we reduce
* overall disk seek times between swap pages. -- sct
* But we do now try to find an empty cluster. -Andrea
+ * And we let swap pages go all over an SSD partition. Hugh
*/
si->flags += SWP_SCANNING;
- if (unlikely(!si->cluster_nr)) {
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
- goto lowest;
- spin_unlock(&swap_lock);
+ scan_base = offset = si->cluster_next;
+
+ /* SSD algorithm */
+ if (si->cluster_info) {
+ scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+ goto checks;
+ }
- offset = si->lowest_bit;
+ if (unlikely(!si->cluster_nr--)) {
+ if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ goto checks;
+ }
+
+ spin_unlock(&si->lock);
+
+ /*
+ * If seek is expensive, start searching for new cluster from
+ * start of partition, to minimize the span of allocated swap.
+ * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
+ * case, just handled by scan_swap_map_try_ssd_cluster() above.
+ */
+ scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
/* Locate the first empty (unaligned) cluster */
@@ -114,48 +534,78 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
if (si->swap_map[offset])
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
- spin_lock(&swap_lock);
- si->cluster_next = offset-SWAPFILE_CLUSTER+1;
- goto cluster;
+ spin_lock(&si->lock);
+ offset -= SWAPFILE_CLUSTER - 1;
+ si->cluster_next = offset;
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
+ goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
}
}
- spin_lock(&swap_lock);
- goto lowest;
+
+ offset = scan_base;
+ spin_lock(&si->lock);
+ si->cluster_nr = SWAPFILE_CLUSTER - 1;
}
- si->cluster_nr--;
-cluster:
- offset = si->cluster_next;
- if (offset > si->highest_bit)
-lowest: offset = si->lowest_bit;
-checks: if (!(si->flags & SWP_WRITEOK))
+checks:
+ if (si->cluster_info) {
+ while (scan_swap_map_ssd_cluster_conflict(si, offset))
+ scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+ }
+ if (!(si->flags & SWP_WRITEOK))
goto no_page;
if (!si->highest_bit)
goto no_page;
- if (!si->swap_map[offset]) {
- if (offset == si->lowest_bit)
- si->lowest_bit++;
- if (offset == si->highest_bit)
- si->highest_bit--;
- si->inuse_pages++;
- if (si->inuse_pages == si->pages) {
- si->lowest_bit = si->max;
- si->highest_bit = 0;
- }
- si->swap_map[offset] = 1;
- si->cluster_next = offset + 1;
- si->flags -= SWP_SCANNING;
- return offset;
+ if (offset > si->highest_bit)
+ scan_base = offset = si->lowest_bit;
+
+ /* reuse swap entry of cache-only swap if not busy. */
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ int swap_was_freed;
+ spin_unlock(&si->lock);
+ swap_was_freed = __try_to_reclaim_swap(si, offset);
+ spin_lock(&si->lock);
+ /* entry was freed successfully, try to use this again */
+ if (swap_was_freed)
+ goto checks;
+ goto scan; /* check next one */
}
- spin_unlock(&swap_lock);
+ if (si->swap_map[offset])
+ goto scan;
+
+ if (offset == si->lowest_bit)
+ si->lowest_bit++;
+ if (offset == si->highest_bit)
+ si->highest_bit--;
+ si->inuse_pages++;
+ if (si->inuse_pages == si->pages) {
+ si->lowest_bit = si->max;
+ si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ si->swap_map[offset] = usage;
+ inc_cluster_info_page(si, si->cluster_info, offset);
+ si->cluster_next = offset + 1;
+ si->flags -= SWP_SCANNING;
+
+ return offset;
+
+scan:
+ spin_unlock(&si->lock);
while (++offset <= si->highest_bit) {
if (!si->swap_map[offset]) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
@@ -163,8 +613,23 @@ checks: if (!(si->flags & SWP_WRITEOK))
latency_ration = LATENCY_LIMIT;
}
}
- spin_lock(&swap_lock);
- goto lowest;
+ offset = si->lowest_bit;
+ while (offset < scan_base) {
+ if (!si->swap_map[offset]) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ goto checks;
+ }
+ if (unlikely(--latency_ration < 0)) {
+ cond_resched();
+ latency_ration = LATENCY_LIMIT;
+ }
+ offset++;
+ }
+ spin_lock(&si->lock);
no_page:
si->flags -= SWP_SCANNING;
@@ -173,68 +638,93 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- spin_lock(&swap_lock);
- if (nr_swap_pages <= 0)
+ if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
- nr_swap_pages--;
-
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- si = swap_info + type;
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next].prio)) {
- next = swap_list.head;
- wrapped++;
+ atomic_long_dec(&nr_swap_pages);
+
+ spin_lock(&swap_avail_lock);
+
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ spin_lock(&si->lock);
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&si->lock);
+ goto nextsi;
}
- if (!si->highest_bit)
- continue;
- if (!(si->flags & SWP_WRITEOK))
- continue;
-
- swap_list.next = next;
- offset = scan_swap_map(si);
- if (offset) {
- spin_unlock(&swap_lock);
- return swp_entry(type, offset);
- }
- next = swap_list.next;
+ /* This is called for allocating swap entry for cache */
+ offset = scan_swap_map(si, SWAP_HAS_CACHE);
+ spin_unlock(&si->lock);
+ if (offset)
+ return swp_entry(si->type, offset);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
+ */
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
- nr_swap_pages++;
+ spin_unlock(&swap_avail_lock);
+
+ atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
+/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
pgoff_t offset;
- spin_lock(&swap_lock);
- si = swap_info + type;
- if (si->flags & SWP_WRITEOK) {
- nr_swap_pages--;
- offset = scan_swap_map(si);
+ si = swap_info[type];
+ spin_lock(&si->lock);
+ if (si && (si->flags & SWP_WRITEOK)) {
+ atomic_long_dec(&nr_swap_pages);
+ /* This is called for allocating swap entry, not cache */
+ offset = scan_swap_map(si, 1);
if (offset) {
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
return swp_entry(type, offset);
}
- nr_swap_pages++;
+ atomic_long_inc(&nr_swap_pages);
}
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
return (swp_entry_t) {0};
}
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
if (!entry.val)
@@ -242,7 +732,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_nofile;
- p = & swap_info[type];
+ p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -250,64 +740,129 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
goto bad_offset;
if (!p->swap_map[offset])
goto bad_free;
- spin_lock(&swap_lock);
+ spin_lock(&p->lock);
return p;
bad_free:
- printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
+ pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
goto out;
bad_offset:
- printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
goto out;
bad_device:
- printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
+ pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
goto out;
bad_nofile:
- printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
+ pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
out:
return NULL;
-}
-
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
-{
- int count = p->swap_map[offset];
-
- if (count < SWAP_MAP_MAX) {
- count--;
- p->swap_map[offset] = count;
- if (!count) {
- if (offset < p->lowest_bit)
- p->lowest_bit = offset;
- if (offset > p->highest_bit)
- p->highest_bit = offset;
- if (p->prio > swap_info[swap_list.next].prio)
- swap_list.next = p - swap_info;
- nr_swap_pages++;
- p->inuse_pages--;
+}
+
+static unsigned char swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned char count;
+ unsigned char has_cache;
+
+ count = p->swap_map[offset];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+
+ if (usage == SWAP_HAS_CACHE) {
+ VM_BUG_ON(!has_cache);
+ has_cache = 0;
+ } else if (count == SWAP_MAP_SHMEM) {
+ /*
+ * Or we could insist on shmem.c using a special
+ * swap_shmem_free() and free_shmem_swap_and_cache()...
+ */
+ count = 0;
+ } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+ if (count == COUNT_CONTINUED) {
+ if (swap_count_continued(p, offset, count))
+ count = SWAP_MAP_MAX | COUNT_CONTINUED;
+ else
+ count = SWAP_MAP_MAX;
+ } else
+ count--;
+ }
+
+ if (!count)
+ mem_cgroup_uncharge_swap(entry);
+
+ usage = count | has_cache;
+ p->swap_map[offset] = usage;
+
+ /* free if no reference */
+ if (!usage) {
+ dec_cluster_info_page(p, p->cluster_info, offset);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
+ p->highest_bit = offset;
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
+ atomic_long_inc(&nr_swap_pages);
+ p->inuse_pages--;
+ frontswap_invalidate_page(p->type, offset);
+ if (p->flags & SWP_BLKDEV) {
+ struct gendisk *disk = p->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev,
+ offset);
}
}
- return count;
+
+ return usage;
}
/*
- * Caller has made sure that the swapdevice corresponding to entry
+ * Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
p = swap_info_get(entry);
if (p) {
- swap_entry_free(p, swp_offset(entry));
- spin_unlock(&swap_lock);
+ swap_entry_free(p, entry, 1);
+ spin_unlock(&p->lock);
+ }
+}
+
+/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ */
+void swapcache_free(swp_entry_t entry, struct page *page)
+{
+ struct swap_info_struct *p;
+ unsigned char count;
+
+ p = swap_info_get(entry);
+ if (p) {
+ count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
+ if (page)
+ mem_cgroup_uncharge_swapcache(page, entry, count != 0);
+ spin_unlock(&p->lock);
}
}
/*
* How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
*/
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
@@ -316,149 +871,181 @@ static inline int page_swapcount(struct page *page)
entry.val = page_private(page);
p = swap_info_get(entry);
if (p) {
- /* Subtract the 1 for the swap cache itself */
- count = p->swap_map[swp_offset(entry)] - 1;
- spin_unlock(&swap_lock);
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ spin_unlock(&p->lock);
}
return count;
}
/*
- * We can use this swap cache entry directly
- * if there are no other references to it.
+ * We can write to an anon page without COW if there are no other references
+ * to it. And as a side-effect, free up its swap: because the old content
+ * on disk will never be read, and seeking back there to write new content
+ * later would only waste time away from clustering.
*/
-int can_share_swap_page(struct page *page)
+int reuse_swap_page(struct page *page)
{
int count;
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page)))
+ return 0;
count = page_mapcount(page);
- if (count <= 1 && PageSwapCache(page))
+ if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
- return count == 1;
+ if (count == 1 && !PageWriteback(page)) {
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ }
+ }
+ return count <= 1;
}
/*
- * Work out if there are any other processes sharing this
- * swap cache page. Free it if you can. Return success.
+ * If swap is getting full, or if there are no more mappings of this page,
+ * then try_to_free_swap is called to free its swap space.
*/
-int remove_exclusive_swap_page(struct page *page)
+int try_to_free_swap(struct page *page)
{
- int retval;
- struct swap_info_struct * p;
- swp_entry_t entry;
-
- BUG_ON(PagePrivate(page));
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!PageSwapCache(page))
return 0;
if (PageWriteback(page))
return 0;
- if (page_count(page) != 2) /* 2: us + cache */
+ if (page_swapcount(page))
return 0;
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (!p)
+ /*
+ * Once hibernation has begun to create its image of memory,
+ * there's a danger that one of the calls to try_to_free_swap()
+ * - most probably a call from __try_to_reclaim_swap() while
+ * hibernation is allocating its own swap pages for the image,
+ * but conceivably even a call from memory reclaim - will free
+ * the swap from a page which has already been recorded in the
+ * image as a clean swapcache page, and then reuse its swap for
+ * another page of the image. On waking from hibernation, the
+ * original page might be freed under memory pressure, then
+ * later read back in from swap, now with the wrong data.
+ *
+ * Hibernation suspends storage while it is writing the image
+ * to disk so check that here.
+ */
+ if (pm_suspended_storage())
return 0;
- /* Is the only swap cache user the cache itself? */
- retval = 0;
- if (p->swap_map[swp_offset(entry)] == 1) {
- /* Recheck the page count with the swapcache lock held.. */
- write_lock_irq(&swapper_space.tree_lock);
- if ((page_count(page) == 2) && !PageWriteback(page)) {
- __delete_from_swap_cache(page);
- SetPageDirty(page);
- retval = 1;
- }
- write_unlock_irq(&swapper_space.tree_lock);
- }
- spin_unlock(&swap_lock);
-
- if (retval) {
- swap_free(entry);
- page_cache_release(page);
- }
-
- return retval;
+ delete_from_swap_cache(page);
+ SetPageDirty(page);
+ return 1;
}
/*
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
*/
-void free_swap_and_cache(swp_entry_t entry)
+int free_swap_and_cache(swp_entry_t entry)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
struct page *page = NULL;
- if (is_migration_entry(entry))
- return;
+ if (non_swap_entry(entry))
+ return 1;
p = swap_info_get(entry);
if (p) {
- if (swap_entry_free(p, swp_offset(entry)) == 1) {
- page = find_get_page(&swapper_space, entry.val);
- if (page && unlikely(TestSetPageLocked(page))) {
+ if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
+ page = find_get_page(swap_address_space(entry),
+ entry.val);
+ if (page && !trylock_page(page)) {
page_cache_release(page);
page = NULL;
}
}
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
}
if (page) {
- int one_user;
-
- BUG_ON(PagePrivate(page));
- one_user = (page_count(page) == 2);
- /* Only cache user (+us), or swap space full? Free it! */
- /* Also recheck PageSwapCache after page is locked (above) */
+ /*
+ * Not mapped elsewhere, or swap space full? Free it!
+ * Also recheck PageSwapCache now page is locked (above).
+ */
if (PageSwapCache(page) && !PageWriteback(page) &&
- (one_user || vm_swap_full())) {
+ (!page_mapped(page) || vm_swap_full())) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
unlock_page(page);
page_cache_release(page);
}
+ return p != NULL;
}
-#ifdef CONFIG_SOFTWARE_SUSPEND
+#ifdef CONFIG_HIBERNATION
/*
- * Find the swap type that corresponds to given device (if any)
+ * Find the swap type that corresponds to given device (if any).
+ *
+ * @offset - number of the PAGE_SIZE-sized block of the device, starting
+ * from 0, in which the swap header is expected to be located.
*
- * This is needed for software suspend and is done in such a way that inode
- * aliasing is allowed.
+ * This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device)
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
- int i;
+ struct block_device *bdev = NULL;
+ int type;
+
+ if (device)
+ bdev = bdget(device);
spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- struct inode *inode;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
- if (!(swap_info[i].flags & SWP_WRITEOK))
+ if (!(sis->flags & SWP_WRITEOK))
continue;
- if (!device) {
+ if (!bdev) {
+ if (bdev_p)
+ *bdev_p = bdgrab(sis->bdev);
+
spin_unlock(&swap_lock);
- return i;
+ return type;
}
- inode = swap_info[i].swap_file->f_dentry->d_inode;
- if (S_ISBLK(inode->i_mode) &&
- device == MKDEV(imajor(inode), iminor(inode))) {
- spin_unlock(&swap_lock);
- return i;
+ if (bdev == sis->bdev) {
+ struct swap_extent *se = &sis->first_swap_extent;
+
+ if (se->start_block == offset) {
+ if (bdev_p)
+ *bdev_p = bdgrab(sis->bdev);
+
+ spin_unlock(&swap_lock);
+ bdput(bdev);
+ return type;
+ }
}
}
spin_unlock(&swap_lock);
+ if (bdev)
+ bdput(bdev);
+
return -ENODEV;
}
/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+ struct block_device *bdev;
+
+ if ((unsigned int)type >= nr_swapfiles)
+ return 0;
+ if (!(swap_info[type]->flags & SWP_WRITEOK))
+ return 0;
+ return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+
+/*
* Return either the total number of swap pages of given type, or the number
* of free pages of that type (depending on @free)
*
@@ -468,38 +1055,94 @@ unsigned int count_swap_pages(int type, int free)
{
unsigned int n = 0;
- if (type < nr_swapfiles) {
- spin_lock(&swap_lock);
- if (swap_info[type].flags & SWP_WRITEOK) {
- n = swap_info[type].pages;
+ spin_lock(&swap_lock);
+ if ((unsigned int)type < nr_swapfiles) {
+ struct swap_info_struct *sis = swap_info[type];
+
+ spin_lock(&sis->lock);
+ if (sis->flags & SWP_WRITEOK) {
+ n = sis->pages;
if (free)
- n -= swap_info[type].inuse_pages;
+ n -= sis->inuse_pages;
}
- spin_unlock(&swap_lock);
+ spin_unlock(&sis->lock);
}
+ spin_unlock(&swap_lock);
return n;
}
+#endif /* CONFIG_HIBERNATION */
+
+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ /*
+ * When pte keeps soft dirty bit the pte generated
+ * from swap entry does not has it, still it's same
+ * pte from logical point of view.
+ */
+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
+#else
+ return pte_same(pte, swp_pte);
#endif
+}
/*
* No need to decide whether this PTE shares the swap entry with others,
* just let do_wp_page work it out if a write is requested later - to
* force COW, vm_page_prot omits write permission from any private vma.
*/
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- inc_mm_counter(vma->vm_mm, anon_rss);
+ struct page *swapcache;
+ struct mem_cgroup *memcg;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret = 1;
+
+ swapcache = page;
+ page = ksm_might_need_to_copy(page, vma, addr);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+ GFP_KERNEL, &memcg)) {
+ ret = -ENOMEM;
+ goto out_nolock;
+ }
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
+ mem_cgroup_cancel_charge_swapin(memcg);
+ ret = 0;
+ goto out;
+ }
+
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
- page_add_anon_rmap(page, vma, addr);
+ if (page == swapcache)
+ page_add_anon_rmap(page, vma, addr);
+ else /* ksm created a completely new copy */
+ page_add_new_anon_rmap(page, vma, addr);
+ mem_cgroup_commit_charge_swapin(page, memcg);
swap_free(entry);
/*
* Move the page to the active list so it is not
* immediately swapped out again after swapon.
*/
activate_page(page);
+out:
+ pte_unmap_unlock(pte, ptl);
+out_nolock:
+ if (page != swapcache) {
+ unlock_page(page);
+ put_page(page);
+ }
+ return ret;
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -508,23 +1151,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
{
pte_t swp_pte = swp_entry_to_pte(entry);
pte_t *pte;
- spinlock_t *ptl;
- int found = 0;
+ int ret = 0;
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ /*
+ * We don't actually need pte lock while scanning for swp_pte: since
+ * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+ * page table while we're scanning; though it could get zapped, and on
+ * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+ * of unmatched parts which look like swp_pte, so unuse_pte must
+ * recheck under pte lock. Scanning without pte lock lets it be
+ * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ */
+ pte = pte_offset_map(pmd, addr);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
- if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte++, addr, entry, page);
- found = 1;
- break;
+ if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+ pte_unmap(pte);
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret)
+ goto out;
+ pte = pte_offset_map(pmd, addr);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(pte - 1, ptl);
- return found;
+ pte_unmap(pte - 1);
+out:
+ return ret;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -533,14 +1187,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int ret;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- if (unuse_pte_range(vma, pmd, addr, next, entry, page))
- return 1;
+ ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pmd++, addr = next, addr != end);
return 0;
}
@@ -551,14 +1207,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
{
pud_t *pud;
unsigned long next;
+ int ret;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (unuse_pmd_range(vma, pud, addr, next, entry, page))
- return 1;
+ ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -568,8 +1226,9 @@ static int unuse_vma(struct vm_area_struct *vma,
{
pgd_t *pgd;
unsigned long addr, end, next;
+ int ret;
- if (page->mapping) {
+ if (page_anon_vma(page)) {
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
return 0;
@@ -585,8 +1244,9 @@ static int unuse_vma(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (unuse_pud_range(vma, pgd, addr, next, entry, page))
- return 1;
+ ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
@@ -595,11 +1255,12 @@ static int unuse_mm(struct mm_struct *mm,
swp_entry_t entry, struct page *page)
{
struct vm_area_struct *vma;
+ int ret = 0;
if (!down_read_trylock(&mm->mmap_sem)) {
/*
- * Activate page so shrink_cache is unlikely to unmap its
- * ptes while lock is dropped, so swapoff can make progress.
+ * Activate page so shrink_inactive_list is unlikely to unmap
+ * its ptes while lock is dropped, so swapoff can make progress.
*/
activate_page(page);
unlock_page(page);
@@ -607,27 +1268,24 @@ static int unuse_mm(struct mm_struct *mm,
lock_page(page);
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && unuse_vma(vma, entry, page))
+ if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
break;
}
up_read(&mm->mmap_sem);
- /*
- * Currently unuse_mm cannot fail, but leave error handling
- * at call sites for now, since we change it from time to time.
- */
- return 0;
+ return (ret < 0)? ret: 0;
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
- int count;
+ unsigned char count;
/*
* No need for swap_lock here: we're just looking
@@ -649,8 +1307,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
- count = si->swap_map[i];
- if (count && count != SWAP_MAP_BAD)
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
+ count = ACCESS_ONCE(si->swap_map[i]);
+ if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
return i;
@@ -660,19 +1324,25 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
- struct swap_info_struct * si = &swap_info[type];
+ struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
- unsigned short *swap_map;
- unsigned short swcount;
+ volatile unsigned char *swap_map; /* swap_map is accessed without
+ * locking. Mark it as volatile
+ * to prevent compiler doing
+ * something odd.
+ */
+ unsigned char swcount;
struct page *page;
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int reset_overflow = 0;
- int shmem;
/*
* When searching mms for an entry, a good strategy is to
@@ -686,8 +1356,7 @@ static int try_to_unuse(unsigned int type)
* together, child after parent. If we race with dup_mmap(), we
* prefer to resolve parent before child, lest we miss entries
* duplicated after we scanned child: using last mm would invert
- * that. Though it's only a serious concern when an overflowed
- * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
+ * that.
*/
start_mm = &init_mm;
atomic_inc(&init_mm.mm_users);
@@ -697,20 +1366,21 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
}
- /*
+ /*
* Get a page for the entry, using the existing swap
* cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
+ * page and read the swap into it.
*/
swap_map = &si->swap_map[i];
entry = swp_entry(type, i);
- page = read_swap_cache_async(entry, NULL, 0);
+ page = read_swap_cache_async(entry,
+ GFP_HIGHUSER_MOVABLE, NULL, 0);
if (!page) {
/*
* Either swap_duplicate() failed because entry
@@ -718,7 +1388,15 @@ static int try_to_unuse(unsigned int type)
* reused since sys_swapoff() already disabled
* allocation from here, or alloc_page() failed.
*/
- if (!*swap_map)
+ swcount = *swap_map;
+ /*
+ * We don't hold lock here, so the swap entry could be
+ * SWAP_MAP_BAD (when the cluster is discarding).
+ * Instead of fail out, We can just skip the swap
+ * entry because swapoff will wait for discarding
+ * finish anyway.
+ */
+ if (!swcount || swcount == SWAP_MAP_BAD)
continue;
retval = -ENOMEM;
break;
@@ -748,18 +1426,19 @@ static int try_to_unuse(unsigned int type)
/*
* Remove all references to entry.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
*/
- shmem = 0;
swcount = *swap_map;
- if (swcount > 1) {
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- retval = unuse_mm(start_mm, entry, page);
+ if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+ retval = shmem_unuse(entry, page);
+ /* page has already been unlocked and released */
+ if (retval < 0)
+ break;
+ continue;
}
- if (*swap_map > 1) {
+ if (swap_count(swcount) && start_mm != &init_mm)
+ retval = unuse_mm(start_mm, entry, page);
+
+ if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
struct list_head *p = &start_mm->mmlist;
struct mm_struct *new_start_mm = start_mm;
@@ -769,7 +1448,7 @@ static int try_to_unuse(unsigned int type)
atomic_inc(&new_start_mm->mm_users);
atomic_inc(&prev_mm->mm_users);
spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval &&
+ while (swap_count(*swap_map) && !retval &&
(p = p->next) != &start_mm->mmlist) {
mm = list_entry(p, struct mm_struct, mmlist);
if (!atomic_inc_not_zero(&mm->mm_users))
@@ -781,13 +1460,13 @@ static int try_to_unuse(unsigned int type)
cond_resched();
swcount = *swap_map;
- if (swcount <= 1)
+ if (!swap_count(swcount)) /* any usage ? */
;
- else if (mm == &init_mm) {
+ else if (mm == &init_mm)
set_start_mm = 1;
- shmem = shmem_unuse(entry, page);
- } else
+ else
retval = unuse_mm(mm, entry, page);
+
if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
atomic_inc(&mm->mm_users);
@@ -808,26 +1487,6 @@ static int try_to_unuse(unsigned int type)
}
/*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- if (*swap_map == SWAP_MAP_MAX) {
- spin_lock(&swap_lock);
- *swap_map = 1;
- spin_unlock(&swap_lock);
- reset_overflow = 1;
- }
-
- /*
* If a reference remains (rare), we would like to leave
* the page in the swap cache; but try_to_unmap could
* then re-duplicate the entry once we drop page lock,
@@ -840,13 +1499,14 @@ static int try_to_unuse(unsigned int type)
* pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files.
*
- * Note shmem_unuse already deleted a swappage from
- * the swap cache, unless the move to filepage failed:
- * in which case it left swappage in cache, lowered its
- * swap count to pass quickly through the loops above,
- * and now we must reincrement count to try again later.
+ * Given how unuse_vma() targets one particular offset
+ * in an anon_vma, once the anon_vma has been determined,
+ * this splitting happens to be just what is needed to
+ * handle where KSM pages have been swapped out: re-reading
+ * is unnecessarily slow, but we can fix that later on.
*/
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
+ if (swap_count(*swap_map) &&
+ PageDirty(page) && PageSwapCache(page)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -855,17 +1515,22 @@ static int try_to_unuse(unsigned int type)
lock_page(page);
wait_on_page_writeback(page);
}
- if (PageSwapCache(page)) {
- if (shmem)
- swap_duplicate(entry);
- else
- delete_from_swap_cache(page);
- }
+
+ /*
+ * It is conceivable that a racing task removed this page from
+ * swap cache just before we acquired the page lock at the top,
+ * or while we dropped it in unuse_mm(). The page might even
+ * be back in swap cache on another swap area: that we must not
+ * delete, since it may not have been written out to swap yet.
+ */
+ if (PageSwapCache(page) &&
+ likely(page_private(page) == entry.val))
+ delete_from_swap_cache(page);
/*
* So we could skip searching mms once swap count went
* to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
+ * mark page dirty so shrink_page_list will preserve it.
*/
SetPageDirty(page);
unlock_page(page);
@@ -876,13 +1541,13 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
return retval;
}
@@ -895,10 +1560,10 @@ static int try_to_unuse(unsigned int type)
static void drain_mmlist(void)
{
struct list_head *p, *next;
- unsigned int i;
+ unsigned int type;
- for (i = 0; i < nr_swapfiles; i++)
- if (swap_info[i].inuse_pages)
+ for (type = 0; type < nr_swapfiles; type++)
+ if (swap_info[type]->inuse_pages)
return;
spin_lock(&mmlist_lock);
list_for_each_safe(p, next, &init_mm.mmlist)
@@ -908,12 +1573,23 @@ static void drain_mmlist(void)
/*
* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
*/
-sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
- struct swap_extent *se = sis->curr_swap_extent;
- struct swap_extent *start_se = se;
+ struct swap_info_struct *sis;
+ struct swap_extent *start_se;
+ struct swap_extent *se;
+ pgoff_t offset;
+
+ sis = swap_info[swp_type(entry)];
+ *bdev = sis->bdev;
+
+ offset = swp_offset(entry);
+ start_se = sis->curr_swap_extent;
+ se = start_se;
for ( ; ; ) {
struct list_head *lh;
@@ -923,8 +1599,6 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
return se->start_block + (offset - se->start_page);
}
lh = se->list.next;
- if (lh == &sis->extent_list)
- lh = lh->next;
se = list_entry(lh, struct swap_extent, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
@@ -932,18 +1606,36 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
}
/*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
+{
+ swp_entry_t entry;
+ entry.val = page_private(page);
+ return map_swap_entry(entry, bdev);
+}
+
+/*
* Free all of a swapdev's extent information
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->extent_list)) {
+ while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;
- se = list_entry(sis->extent_list.next,
+ se = list_entry(sis->first_swap_extent.list.next,
struct swap_extent, list);
list_del(&se->list);
kfree(se);
}
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ sis->flags &= ~SWP_FILE;
+ mapping->a_ops->swap_deactivate(swap_file);
+ }
}
/*
@@ -952,7 +1644,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
*
* This function rather assumes that it is called in ascending page order.
*/
-static int
+int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
@@ -960,8 +1652,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
struct swap_extent *new_se;
struct list_head *lh;
- lh = sis->extent_list.prev; /* The highest page extent */
- if (lh != &sis->extent_list) {
+ if (start_page == 0) {
+ se = &sis->first_swap_extent;
+ sis->curr_swap_extent = se;
+ se->start_page = 0;
+ se->nr_pages = nr_pages;
+ se->start_block = start_block;
+ return 1;
+ } else {
+ lh = sis->first_swap_extent.list.prev; /* Highest extent */
se = list_entry(lh, struct swap_extent, list);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
@@ -981,7 +1680,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
- list_add_tail(&new_se->list, &sis->extent_list);
+ list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
}
@@ -1018,267 +1717,278 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
- struct inode *inode;
- unsigned blocks_per_page;
- unsigned long page_no;
- unsigned blkbits;
- sector_t probe_block;
- sector_t last_block;
- sector_t lowest_block = -1;
- sector_t highest_block = 0;
- int nr_extents = 0;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
int ret;
- inode = sis->swap_file->f_mapping->host;
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
- goto done;
+ return ret;
}
- blkbits = inode->i_blkbits;
- blocks_per_page = PAGE_SIZE >> blkbits;
-
- /*
- * Map all the blocks into the extent list. This code doesn't try
- * to be very smart.
- */
- probe_block = 0;
- page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
- while ((probe_block + blocks_per_page) <= last_block &&
- page_no < sis->max) {
- unsigned block_in_page;
- sector_t first_block;
-
- first_block = bmap(inode, probe_block);
- if (first_block == 0)
- goto bad_bmap;
-
- /*
- * It must be PAGE_SIZE aligned on-disk
- */
- if (first_block & (blocks_per_page - 1)) {
- probe_block++;
- goto reprobe;
- }
-
- for (block_in_page = 1; block_in_page < blocks_per_page;
- block_in_page++) {
- sector_t block;
-
- block = bmap(inode, probe_block + block_in_page);
- if (block == 0)
- goto bad_bmap;
- if (block != first_block + block_in_page) {
- /* Discontiguity */
- probe_block++;
- goto reprobe;
- }
- }
-
- first_block >>= (PAGE_SHIFT - blkbits);
- if (page_no) { /* exclude the header page */
- if (first_block < lowest_block)
- lowest_block = first_block;
- if (first_block > highest_block)
- highest_block = first_block;
+ if (mapping->a_ops->swap_activate) {
+ ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (!ret) {
+ sis->flags |= SWP_FILE;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
}
+ return ret;
+ }
- /*
- * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
- */
- ret = add_swap_extent(sis, page_no, 1, first_block);
- if (ret < 0)
- goto out;
- nr_extents += ret;
- page_no++;
- probe_block += blocks_per_page;
-reprobe:
- continue;
- }
- ret = nr_extents;
- *span = 1 + highest_block - lowest_block;
- if (page_no == 0)
- page_no = 1; /* force Empty message */
- sis->max = page_no;
- sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
-done:
- sis->curr_swap_extent = list_entry(sis->extent_list.prev,
- struct swap_extent, list);
- goto out;
-bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
- ret = -EINVAL;
-out:
- return ret;
+ return generic_swapfile_activate(sis, swap_file, span);
}
-#if 0 /* We don't need this yet */
-#include <linux/backing-dev.h>
-int page_queue_congested(struct page *page)
+static void _enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
- struct backing_dev_info *bdi;
-
- BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
+ p->swap_map = swap_map;
+ p->cluster_info = cluster_info;
+ p->flags |= SWP_WRITEOK;
+ atomic_long_add(p->pages, &nr_swap_pages);
+ total_swap_pages += p->pages;
+
+ assert_spin_locked(&swap_lock);
+ /*
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
+ */
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+}
- if (PageSwapCache(page)) {
- swp_entry_t entry = { .val = page_private(page) };
- struct swap_info_struct *sis;
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info,
+ unsigned long *frontswap_map)
+{
+ frontswap_init(p->type, frontswap_map);
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+}
- sis = get_swap_info_struct(swp_type(entry));
- bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
- } else
- bdi = page->mapping->backing_dev_info;
- return bdi_write_congested(bdi);
+static void reinsert_swap_info(struct swap_info_struct *p)
+{
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
}
-#endif
-asmlinkage long sys_swapoff(const char __user * specialfile)
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
- struct swap_info_struct * p = NULL;
- unsigned short *swap_map;
+ struct swap_info_struct *p = NULL;
+ unsigned char *swap_map;
+ struct swap_cluster_info *cluster_info;
+ unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char * pathname;
- int i, type, prev;
- int err;
-
+ struct filename *pathname;
+ int err, found = 0;
+ unsigned int old_block_size;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ BUG_ON(!current->mm);
+
pathname = getname(specialfile);
- err = PTR_ERR(pathname);
if (IS_ERR(pathname))
- goto out;
+ return PTR_ERR(pathname);
- victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
- putname(pathname);
+ victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
err = PTR_ERR(victim);
if (IS_ERR(victim))
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
- p = swap_info + type;
- if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
- if (p->swap_file->f_mapping == mapping)
+ plist_for_each_entry(p, &swap_active_head, list) {
+ if (p->flags & SWP_WRITEOK) {
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
}
- if (!security_vm_enough_memory(p->pages))
+ if (!security_vm_enough_memory_mm(current->mm, p->pages))
vm_unacct_memory(p->pages);
else {
err = -ENOMEM;
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0) {
- swap_list.head = p->next;
- } else {
- swap_info[prev].next = p->next;
- }
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ spin_lock(&p->lock);
+ if (p->prio < 0) {
+ struct swap_info_struct *si = p;
+
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
+ }
+ least_priority++;
}
- nr_swap_pages -= p->pages;
+ plist_del(&p->list, &swap_active_head);
+ atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- current->flags |= PF_SWAPOFF;
- err = try_to_unuse(type);
- current->flags &= ~PF_SWAPOFF;
+ set_current_oom_origin();
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ clear_current_oom_origin();
if (err) {
/* re-insert swap space back into swap_list */
- spin_lock(&swap_lock);
- for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
- if (p->prio >= swap_info[i].prio)
- break;
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p - swap_info;
- else
- swap_info[prev].next = p - swap_info;
- nr_swap_pages += p->pages;
- total_swap_pages += p->pages;
- p->flags |= SWP_WRITEOK;
- spin_unlock(&swap_lock);
+ reinsert_swap_info(p);
goto out_dput;
}
- /* wait for any unplug function to finish */
- down_write(&swap_unplug_sem);
- up_write(&swap_unplug_sem);
+ flush_work(&p->discard_work);
destroy_swap_extents(p);
+ if (p->flags & SWP_CONTINUED)
+ free_swap_count_continuations(p);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
drain_mmlist();
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
schedule_timeout_uninterruptible(1);
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
}
swap_file = p->swap_file;
+ old_block_size = p->old_block_size;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
- p->flags = 0;
+ cluster_info = p->cluster_info;
+ p->cluster_info = NULL;
+ frontswap_map = frontswap_map_get(p);
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ frontswap_invalidate_area(p->type);
+ frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
vfree(swap_map);
+ vfree(cluster_info);
+ vfree(frontswap_map);
+ /* Destroy swap account information */
+ swap_cgroup_swapoff(p->type);
+
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
- set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ set_blocksize(bdev, old_block_size);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
+
+ /*
+ * Clear the SWP_USED flag after all resources are freed so that swapon
+ * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+ * not hold p->lock after we cleared its SWP_WRITEOK.
+ */
+ spin_lock(&swap_lock);
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+
err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
out_dput:
filp_close(victim, NULL);
out:
+ putname(pathname);
return err;
}
#ifdef CONFIG_PROC_FS
+/*
+ * poll() handler for the "swaps" proc file.
+ *
+ * The file is always reported readable. POLLERR | POLLPRI is added when
+ * proc_poll_event has changed since this open file last recorded it;
+ * swapon and swapoff increment that counter and wake proc_poll_wait.
+ */
+static unsigned swaps_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+
+ poll_wait(file, &proc_poll_wait, wait);
+
+ if (seq->poll_event != atomic_read(&proc_poll_event)) {
+ /* remember the new value so the change is reported only once */
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+ return POLLIN | POLLRDNORM;
+}
+
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
- struct swap_info_struct *ptr = swap_info;
- int i;
+ struct swap_info_struct *si;
+ int type;
loff_t l = *pos;
mutex_lock(&swapon_mutex);
- for (i = 0; i < nr_swapfiles; i++, ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ if (!l)
+ return SEQ_START_TOKEN;
+
+ for (type = 0; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
- if (!l--)
- return ptr;
+ if (!--l)
+ return si;
}
return NULL;
@@ -1286,14 +1996,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
- struct swap_info_struct *ptr = v;
- struct swap_info_struct *endptr = swap_info + nr_swapfiles;
-
- for (++ptr; ptr < endptr; ptr++) {
- if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+ struct swap_info_struct *si = v;
+ int type;
+
+ if (v == SEQ_START_TOKEN)
+ type = 0;
+ else
+ type = si->type + 1;
+
+ for (; type < nr_swapfiles; type++) {
+ smp_rmb(); /* read nr_swapfiles before swap_info[type] */
+ si = swap_info[type];
+ if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
- return ptr;
+ return si;
}
return NULL;
@@ -1306,26 +2023,28 @@ static void swap_stop(struct seq_file *swap, void *v)
static int swap_show(struct seq_file *swap, void *v)
{
- struct swap_info_struct *ptr = v;
+ struct swap_info_struct *si = v;
struct file *file;
int len;
- if (v == swap_info)
- seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ if (si == SEQ_START_TOKEN) {
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ return 0;
+ }
- file = ptr->swap_file;
- len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
+ file = si->swap_file;
+ len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
- len < 40 ? 40 - len : 1, " ",
- S_ISBLK(file->f_dentry->d_inode->i_mode) ?
+ len < 40 ? 40 - len : 1, " ",
+ S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
- ptr->pages << (PAGE_SHIFT - 10),
- ptr->inuse_pages << (PAGE_SHIFT - 10),
- ptr->prio);
+ si->pages << (PAGE_SHIFT - 10),
+ si->inuse_pages << (PAGE_SHIFT - 10),
+ si->prio);
return 0;
}
-static struct seq_operations swaps_op = {
+static const struct seq_operations swaps_op = {
.start = swap_start,
.next = swap_next,
.stop = swap_stop,
@@ -1334,141 +2053,359 @@ static struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &swaps_op);
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &swaps_op);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->poll_event = atomic_read(&proc_poll_event);
+ return 0;
}
-static struct file_operations proc_swaps_operations = {
+static const struct file_operations proc_swaps_operations = {
.open = swaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
+ .poll = swaps_poll,
};
static int __init procswaps_init(void)
{
- struct proc_dir_entry *entry;
-
- entry = create_proc_entry("swaps", 0, NULL);
- if (entry)
- entry->proc_fops = &proc_swaps_operations;
+ proc_create("swaps", 0, NULL, &proc_swaps_operations);
return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */
+/*
+ * If MAX_SWAPFILES_CHECK is defined, invoke it once from a late initcall.
+ * What the macro verifies is defined outside this file.
+ */
+#ifdef MAX_SWAPFILES_CHECK
+static int __init max_swapfiles_check(void)
+{
+ MAX_SWAPFILES_CHECK();
+ return 0;
+}
+late_initcall(max_swapfiles_check);
+#endif
+
+/*
+ * Reserve a swap_info_struct for a new swap area.
+ *
+ * Scans swap_info[] under swap_lock for an entry without SWP_USED. If one
+ * exists it is reused and the freshly allocated structure is freed;
+ * otherwise the new structure is installed at index nr_swapfiles and
+ * nr_swapfiles is incremented.
+ *
+ * Returns the entry with flags set to SWP_USED, ERR_PTR(-ENOMEM) if the
+ * allocation fails, or ERR_PTR(-EPERM) if no slot below MAX_SWAPFILES is
+ * available.
+ */
+static struct swap_info_struct *alloc_swap_info(void)
+{
+ struct swap_info_struct *p;
+ unsigned int type;
+
+ /* allocated up front, before swap_lock is taken */
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&swap_lock);
+ /* first entry not marked SWP_USED, or nr_swapfiles if there is none */
+ for (type = 0; type < nr_swapfiles; type++) {
+ if (!(swap_info[type]->flags & SWP_USED))
+ break;
+ }
+ if (type >= MAX_SWAPFILES) {
+ spin_unlock(&swap_lock);
+ kfree(p);
+ return ERR_PTR(-EPERM);
+ }
+ if (type >= nr_swapfiles) {
+ p->type = type;
+ swap_info[type] = p;
+ /*
+ * Write swap_info[type] before nr_swapfiles, in case a
+ * racing procfs swap_start() or swap_next() is reading them.
+ * (We never shrink nr_swapfiles, we never free this entry.)
+ */
+ smp_wmb();
+ nr_swapfiles++;
+ } else {
+ /* a free entry exists: drop the new allocation and reuse it */
+ kfree(p);
+ p = swap_info[type];
+ /*
+ * Do not memset this entry: a racing procfs swap_next()
+ * would be relying on p->type to remain valid.
+ */
+ }
+ INIT_LIST_HEAD(&p->first_swap_extent.list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
+ p->flags = SWP_USED;
+ spin_unlock(&swap_lock);
+ spin_lock_init(&p->lock);
+
+ return p;
+}
+
+/*
+ * Claim the backing store of swap area @p.
+ *
+ * Block device: grabs the device and opens it with FMODE_EXCL, records its
+ * current block size in p->old_block_size, switches it to PAGE_SIZE and
+ * sets SWP_BLKDEV.
+ * Regular file: points p->bdev at the superblock's device and takes
+ * inode->i_mutex. The mutex is still held on return, both on success and
+ * on the -EBUSY path, so the caller has to release it.
+ *
+ * Returns 0 on success, -EBUSY if the file is already a swapfile, -EINVAL
+ * if blkdev_get() fails or the inode is neither a block device nor a
+ * regular file, or the error from set_blocksize().
+ */
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+{
+ int error;
+
+ if (S_ISBLK(inode->i_mode)) {
+ p->bdev = bdgrab(I_BDEV(inode));
+ error = blkdev_get(p->bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ sys_swapon);
+ if (error < 0) {
+ p->bdev = NULL; /* nothing for the caller to put */
+ return -EINVAL;
+ }
+ p->old_block_size = block_size(p->bdev);
+ error = set_blocksize(p->bdev, PAGE_SIZE);
+ if (error < 0)
+ return error; /* p->bdev stays set for the caller's cleanup */
+ p->flags |= SWP_BLKDEV;
+ } else if (S_ISREG(inode->i_mode)) {
+ p->bdev = inode->i_sb->s_bdev;
+ mutex_lock(&inode->i_mutex);
+ if (IS_SWAPFILE(inode))
+ return -EBUSY; /* i_mutex intentionally left held */
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Validate the on-disk swap header and work out how many page slots the
+ * swap area provides.
+ *
+ * @p: swap_info being set up; lowest_bit, cluster_next, cluster_nr and
+ * highest_bit are initialised here.
+ * @swap_header: mapped first page of the swap area; byte-swapped in place
+ * if it was written with the opposite endianness.
+ * @inode: inode backing the swap area, used for the size check.
+ *
+ * Returns the number of page slots (header page included), or 0 if the
+ * header is missing, unsupported, inconsistent or describes an empty area.
+ */
+static unsigned long read_swap_header(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ struct inode *inode)
+{
+ int i;
+ unsigned long maxpages;
+ unsigned long swapfilepages;
+ unsigned long last_page;
+
+ if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
+ pr_err("Unable to find swap-space signature\n");
+ return 0;
+ }
+
+ /* swap partition endianness hack... */
+ if (swab32(swap_header->info.version) == 1) {
+ swab32s(&swap_header->info.version);
+ swab32s(&swap_header->info.last_page);
+ swab32s(&swap_header->info.nr_badpages);
+ /*
+ * nr_badpages comes straight from disk: bound it before using
+ * it as a loop limit so we never swap bytes past badpages[].
+ */
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ return 0;
+ for (i = 0; i < swap_header->info.nr_badpages; i++)
+ swab32s(&swap_header->info.badpages[i]);
+ }
+ /* Check the swap header's sub-version */
+ if (swap_header->info.version != 1) {
+ pr_warn("Unable to handle swap header version %d\n",
+ swap_header->info.version);
+ return 0;
+ }
+
+ p->lowest_bit = 1;
+ p->cluster_next = 1;
+ p->cluster_nr = 0;
+
+ /*
+ * Find out how many pages are allowed for a single swap
+ * device. There are two limiting factors: 1) the number
+ * of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte as defined by the
+ * different architectures. In order to find the
+ * largest possible bit mask, a swap entry with swap type 0
+ * and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap
+ * offset is extracted. This will mask all the bits from
+ * the initial ~0UL mask that can't be encoded in either
+ * the swp_entry_t or the architecture definition of a
+ * swap pte.
+ */
+ maxpages = swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ last_page = swap_header->info.last_page;
+ /*
+ * A header-only area has no usable slots. Reject it here: with
+ * last_page == 0 highest_bit would be 0, and sys_swapon() computes
+ * prandom_u32() % p->highest_bit for non-rotational devices.
+ */
+ if (!last_page) {
+ pr_warn("Empty swap-file\n");
+ return 0;
+ }
+ if (last_page > maxpages) {
+ pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+ maxpages << (PAGE_SHIFT - 10),
+ last_page << (PAGE_SHIFT - 10));
+ }
+ if (maxpages > last_page) {
+ maxpages = last_page + 1;
+ /* p->max is an unsigned int: don't overflow it */
+ if ((unsigned int)maxpages == 0)
+ maxpages = UINT_MAX;
+ }
+ p->highest_bit = maxpages - 1;
+
+ if (!maxpages)
+ return 0;
+ /* a zero i_size skips the length check below */
+ swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+ if (swapfilepages && maxpages > swapfilepages) {
+ pr_warn("Swap area shorter than signature indicates\n");
+ return 0;
+ }
+ /* bad-page lists are only accepted on block devices */
+ if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+ return 0;
+ if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ return 0;
+
+ return maxpages;
+}
+
+/*
+ * Fill in @swap_map for a new swap area and build its extent list.
+ *
+ * Marks every listed bad page below @maxpages, and the header page (slot
+ * 0), as SWAP_MAP_BAD; sets p->max and p->pages; calls
+ * setup_swap_extents(); and, when @cluster_info is non-NULL, links every
+ * cluster whose count is still zero onto p's free-cluster list, starting
+ * from the cluster that contains p->cluster_next and wrapping around.
+ *
+ * Returns the non-negative value from setup_swap_extents() on success, or
+ * a negative errno: -EINVAL for an out-of-range bad-page entry or an area
+ * with no usable pages, or the error from setup_swap_extents().
+ */
+static int setup_swap_map_and_extents(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info,
+ unsigned long maxpages,
+ sector_t *span)
+{
+ int i;
+ unsigned int nr_good_pages;
+ int nr_extents;
+ unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+ unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
+
+ nr_good_pages = maxpages - 1; /* omit header page */
+
+ cluster_set_null(&p->free_cluster_head);
+ cluster_set_null(&p->free_cluster_tail);
+ cluster_set_null(&p->discard_cluster_head);
+ cluster_set_null(&p->discard_cluster_tail);
+
+ for (i = 0; i < swap_header->info.nr_badpages; i++) {
+ unsigned int page_nr = swap_header->info.badpages[i];
+ if (page_nr == 0 || page_nr > swap_header->info.last_page)
+ return -EINVAL;
+ if (page_nr < maxpages) {
+ swap_map[page_nr] = SWAP_MAP_BAD;
+ nr_good_pages--;
+ /*
+ * Haven't marked the cluster free yet, no list
+ * operation involved
+ */
+ inc_cluster_info_page(p, cluster_info, page_nr);
+ }
+ }
+
+ /* Haven't marked the cluster free yet, no list operation involved */
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+ inc_cluster_info_page(p, cluster_info, i);
+
+ if (nr_good_pages) {
+ swap_map[0] = SWAP_MAP_BAD;
+ /*
+ * Haven't marked the cluster free yet, no list
+ * operation involved
+ */
+ inc_cluster_info_page(p, cluster_info, 0);
+ p->max = maxpages;
+ p->pages = nr_good_pages;
+ nr_extents = setup_swap_extents(p, span);
+ if (nr_extents < 0)
+ return nr_extents;
+ /* re-read p->pages after setup_swap_extents() */
+ nr_good_pages = p->pages;
+ }
+ if (!nr_good_pages) {
+ pr_warn("Empty swap-file\n");
+ return -EINVAL;
+ }
+
+ if (!cluster_info)
+ return nr_extents;
+
+ /* visit each cluster once, starting at idx and wrapping to 0 */
+ for (i = 0; i < nr_clusters; i++) {
+ if (!cluster_count(&cluster_info[idx])) {
+ cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+ if (cluster_is_null(&p->free_cluster_head)) {
+ /* first free cluster: it is both head and tail */
+ cluster_set_next_flag(&p->free_cluster_head,
+ idx, 0);
+ cluster_set_next_flag(&p->free_cluster_tail,
+ idx, 0);
+ } else {
+ unsigned int tail;
+
+ /* append after the current tail, then move the tail */
+ tail = cluster_next(&p->free_cluster_tail);
+ cluster_set_next(&cluster_info[tail], idx);
+ cluster_set_next_flag(&p->free_cluster_tail,
+ idx, 0);
+ }
+ }
+ idx++;
+ if (idx == nr_clusters)
+ idx = 0;
+ }
+ return nr_extents;
+}
+
/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
+ * Helper to sys_swapon determining if a given swap
+ * backing device queue supports DISCARD operations.
*/
-asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
+static bool swap_discardable(struct swap_info_struct *si)
{
- struct swap_info_struct * p;
- char *name = NULL;
- struct block_device *bdev = NULL;
+ struct request_queue *q = bdev_get_queue(si->bdev);
+
+ if (!q || !blk_queue_discard(q))
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+ struct swap_info_struct *p;
+ struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
- unsigned int type;
- int i, prev;
+ int i;
+ int prio;
int error;
- static int least_priority;
- union swap_header *swap_header = NULL;
- int swap_header_version;
- unsigned int nr_good_pages = 0;
- int nr_extents = 0;
+ union swap_header *swap_header;
+ int nr_extents;
sector_t span;
- unsigned long maxpages = 1;
- int swapfilesize;
- unsigned short *swap_map;
+ unsigned long maxpages;
+ unsigned char *swap_map = NULL;
+ struct swap_cluster_info *cluster_info = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
- int did_down = 0;
+
+ if (swap_flags & ~SWAP_FLAGS_VALID)
+ return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- spin_lock(&swap_lock);
- p = swap_info;
- for (type = 0 ; type < nr_swapfiles ; type++,p++)
- if (!(p->flags & SWP_USED))
- break;
- error = -EPERM;
- if (type >= MAX_SWAPFILES) {
- spin_unlock(&swap_lock);
- goto out;
- }
- if (type >= nr_swapfiles)
- nr_swapfiles = type+1;
- INIT_LIST_HEAD(&p->extent_list);
- p->flags = SWP_USED;
- p->swap_file = NULL;
- p->old_block_size = 0;
- p->swap_map = NULL;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- p->cluster_nr = 0;
- p->inuse_pages = 0;
- p->next = -1;
- if (swap_flags & SWAP_FLAG_PREFER) {
- p->prio =
- (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
- } else {
- p->prio = --least_priority;
- }
- spin_unlock(&swap_lock);
+
+ p = alloc_swap_info();
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ INIT_WORK(&p->discard_work, swap_discard_work);
+
name = getname(specialfile);
- error = PTR_ERR(name);
if (IS_ERR(name)) {
+ error = PTR_ERR(name);
name = NULL;
- goto bad_swap_2;
+ goto bad_swap;
}
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
- error = PTR_ERR(swap_file);
+ swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swap_file)) {
+ error = PTR_ERR(swap_file);
swap_file = NULL;
- goto bad_swap_2;
+ goto bad_swap;
}
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
- inode = mapping->host;
- error = -EBUSY;
for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *q = &swap_info[i];
+ struct swap_info_struct *q = swap_info[i];
- if (i == type || !q->swap_file)
+ if (q == p || !q->swap_file)
continue;
- if (mapping == q->swap_file->f_mapping)
- goto bad_swap;
- }
-
- error = -EINVAL;
- if (S_ISBLK(inode->i_mode)) {
- bdev = I_BDEV(inode);
- error = bd_claim(bdev, sys_swapon);
- if (error < 0) {
- bdev = NULL;
- error = -EINVAL;
- goto bad_swap;
- }
- p->old_block_size = block_size(bdev);
- error = set_blocksize(bdev, PAGE_SIZE);
- if (error < 0)
- goto bad_swap;
- p->bdev = bdev;
- } else if (S_ISREG(inode->i_mode)) {
- p->bdev = inode->i_sb->s_bdev;
- mutex_lock(&inode->i_mutex);
- did_down = 1;
- if (IS_SWAPFILE(inode)) {
+ if (mapping == q->swap_file->f_mapping) {
error = -EBUSY;
goto bad_swap;
}
- } else {
- goto bad_swap;
}
- swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+ inode = mapping->host;
+ /* For a regular file, claim_swapfile() takes inode->i_mutex and leaves it held */
+ error = claim_swapfile(p, inode);
+ if (unlikely(error))
+ goto bad_swap;
/*
* Read the swap header.
@@ -1482,161 +2419,137 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
error = PTR_ERR(page);
goto bad_swap;
}
- wait_on_page_locked(page);
- if (!PageUptodate(page))
- goto bad_swap;
- kmap(page);
- swap_header = page_address(page);
+ swap_header = kmap(page);
- if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
- swap_header_version = 1;
- else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
- swap_header_version = 2;
- else {
- printk(KERN_ERR "Unable to find swap-space signature\n");
+ maxpages = read_swap_header(p, swap_header, inode);
+ if (unlikely(!maxpages)) {
error = -EINVAL;
goto bad_swap;
}
-
- switch (swap_header_version) {
- case 1:
- printk(KERN_ERR "version 0 swap is no longer supported. "
- "Use mkswap -v1 %s\n", name);
- error = -EINVAL;
- goto bad_swap;
- case 2:
- /* Check the swap header's sub-version and the size of
- the swap file and bad block lists */
- if (swap_header->info.version != 1) {
- printk(KERN_WARNING
- "Unable to handle swap header version %d\n",
- swap_header->info.version);
- error = -EINVAL;
- goto bad_swap;
- }
-
- p->lowest_bit = 1;
- p->cluster_next = 1;
+ /* OK, set up the swap map and apply the bad block list */
+ swap_map = vzalloc(maxpages);
+ if (!swap_map) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+ if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ p->flags |= SWP_SOLIDSTATE;
/*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number of
- * bits for the swap offset in the swp_entry_t type and
- * 2) the number of bits in the a swap pte as defined by
- * the different architectures. In order to find the
- * largest possible bit mask a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
+ * select a random position to start with to help wear leveling
+ * SSD
*/
- maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
- if (maxpages > swap_header->info.last_page)
- maxpages = swap_header->info.last_page;
- p->highest_bit = maxpages - 1;
+ p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
- error = -EINVAL;
- if (!maxpages)
- goto bad_swap;
- if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
- goto bad_swap;
- if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+ cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
+ SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+ if (!cluster_info) {
+ error = -ENOMEM;
goto bad_swap;
-
- /* OK, set up the swap map and apply the bad block list */
- if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
+ }
+ p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!p->percpu_cluster) {
error = -ENOMEM;
goto bad_swap;
}
-
- error = 0;
- memset(p->swap_map, 0, maxpages * sizeof(short));
- for (i = 0; i < swap_header->info.nr_badpages; i++) {
- int page_nr = swap_header->info.badpages[i];
- if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
- error = -EINVAL;
- else
- p->swap_map[page_nr] = SWAP_MAP_BAD;
+ for_each_possible_cpu(i) {
+ struct percpu_cluster *cluster;
+ cluster = per_cpu_ptr(p->percpu_cluster, i);
+ cluster_set_null(&cluster->index);
}
- nr_good_pages = swap_header->info.last_page -
- swap_header->info.nr_badpages -
- 1 /* header page */;
- if (error)
- goto bad_swap;
}
- if (swapfilesize && maxpages > swapfilesize) {
- printk(KERN_WARNING
- "Swap area shorter than signature indicates\n");
- error = -EINVAL;
+ error = swap_cgroup_swapon(p->type, maxpages);
+ if (error)
goto bad_swap;
- }
- if (nr_good_pages) {
- p->swap_map[0] = SWAP_MAP_BAD;
- p->max = maxpages;
- p->pages = nr_good_pages;
- nr_extents = setup_swap_extents(p, &span);
- if (nr_extents < 0) {
- error = nr_extents;
- goto bad_swap;
- }
- nr_good_pages = p->pages;
- }
- if (!nr_good_pages) {
- printk(KERN_WARNING "Empty swap-file\n");
- error = -EINVAL;
+
+ nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+ cluster_info, maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled)
+ frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
- mutex_lock(&swapon_mutex);
- spin_lock(&swap_lock);
- p->flags = SWP_ACTIVE;
- nr_swap_pages += nr_good_pages;
- total_swap_pages += nr_good_pages;
-
- printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk\n",
- nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
- nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
-
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
- if (p->prio >= swap_info[i].prio) {
- break;
+ if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ /*
+ * When discard is enabled for swap with no particular
+ * policy flagged, we set all swap discard flags here in
+ * order to sustain backward compatibility with older
+ * swapon(8) releases.
+ */
+ p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+ SWP_PAGE_DISCARD);
+
+ /*
+ * By flagging sys_swapon, a sysadmin can tell us to
+ * either do single-time area discards only, or to just
+ * perform discards for released swap page-clusters.
+ * Now it's time to adjust the p->flags accordingly.
+ */
+ if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+ p->flags &= ~SWP_PAGE_DISCARD;
+ else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+ p->flags &= ~SWP_AREA_DISCARD;
+
+ /* issue a swapon-time discard if it's still required */
+ if (p->flags & SWP_AREA_DISCARD) {
+ int err = discard_swap(p);
+ if (unlikely(err))
+ pr_err("swapon: discard_swap(%p): %d\n",
+ p, err);
}
- prev = i;
- }
- p->next = i;
- if (prev < 0) {
- swap_list.head = swap_list.next = p - swap_info;
- } else {
- swap_info[prev].next = p - swap_info;
}
- spin_unlock(&swap_lock);
+
+ mutex_lock(&swapon_mutex);
+ prio = -1;
+ if (swap_flags & SWAP_FLAG_PREFER)
+ prio =
+ (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
+
+ pr_info("Adding %uk swap on %s. "
+ "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
+ nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
+ (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (p->flags & SWP_AREA_DISCARD) ? "s" : "",
+ (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
+ (frontswap_map) ? "FS" : "");
+
mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
+ if (S_ISREG(inode->i_mode))
+ inode->i_flags |= S_SWAPFILE;
error = 0;
goto out;
bad_swap:
- if (bdev) {
- set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
+ if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+ set_blocksize(p->bdev, p->old_block_size);
+ blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
-bad_swap_2:
+ swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
- swap_map = p->swap_map;
p->swap_file = NULL;
- p->swap_map = NULL;
p->flags = 0;
- if (!(swap_flags & SWAP_FLAG_PREFER))
- ++least_priority;
spin_unlock(&swap_lock);
vfree(swap_map);
- if (swap_file)
+ vfree(cluster_info);
+ if (swap_file) {
+ if (inode && S_ISREG(inode->i_mode)) {
+ mutex_unlock(&inode->i_mutex);
+ inode = NULL;
+ }
filp_close(swap_file, NULL);
+ }
out:
if (page && !IS_ERR(page)) {
kunmap(page);
@@ -1644,27 +2557,24 @@ out:
}
if (name)
putname(name);
- if (did_down) {
- if (!error)
- inode->i_flags |= S_SWAPFILE;
+ if (inode && S_ISREG(inode->i_mode))
mutex_unlock(&inode->i_mutex);
- }
return error;
}
void si_swapinfo(struct sysinfo *val)
{
- unsigned int i;
+ unsigned int type;
unsigned long nr_to_be_unused = 0;
spin_lock(&swap_lock);
- for (i = 0; i < nr_swapfiles; i++) {
- if (!(swap_info[i].flags & SWP_USED) ||
- (swap_info[i].flags & SWP_WRITEOK))
- continue;
- nr_to_be_unused += swap_info[i].inuse_pages;
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *si = swap_info[type];
+
+ if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
+ nr_to_be_unused += si->inuse_pages;
}
- val->freeswap = nr_swap_pages + nr_to_be_unused;
+ val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
spin_unlock(&swap_lock);
}
@@ -1672,81 +2582,360 @@ void si_swapinfo(struct sysinfo *val)
/*
* Verify that a swap entry is valid and increment its swap map count.
*
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
+ * Return value (error codes are returned as negative errnos):
+ * - success -> 0
+ * - swp_entry is invalid -> EINVAL
+ * - swp_entry is migration entry -> EINVAL
+ * - swap-cache reference is requested but there is already one. -> EEXIST
+ * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
-int swap_duplicate(swp_entry_t entry)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
- struct swap_info_struct * p;
+ struct swap_info_struct *p;
unsigned long offset, type;
- int result = 0;
+ unsigned char count;
+ unsigned char has_cache;
+ int err = -EINVAL;
- if (is_migration_entry(entry))
- return 1;
+ if (non_swap_entry(entry))
+ goto out;
type = swp_type(entry);
if (type >= nr_swapfiles)
goto bad_file;
- p = type + swap_info;
+ p = swap_info[type];
offset = swp_offset(entry);
- spin_lock(&swap_lock);
- if (offset < p->max && p->swap_map[offset]) {
- if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
- p->swap_map[offset]++;
- result = 1;
- } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
- if (swap_overflow++ < 5)
- printk(KERN_WARNING "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = SWAP_MAP_MAX;
- result = 1;
- }
+ spin_lock(&p->lock);
+ if (unlikely(offset >= p->max))
+ goto unlock_out;
+
+ count = p->swap_map[offset];
+
+ /*
+ * swapin_readahead() doesn't check if a swap entry is valid, so the
+ * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+ */
+ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+ err = -ENOENT;
+ goto unlock_out;
}
- spin_unlock(&swap_lock);
+
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+ err = 0;
+
+ if (usage == SWAP_HAS_CACHE) {
+
+ /* set SWAP_HAS_CACHE if there is no cache and entry is used */
+ if (!has_cache && count)
+ has_cache = SWAP_HAS_CACHE;
+ else if (has_cache) /* someone else added cache */
+ err = -EEXIST;
+ else /* no users remaining */
+ err = -ENOENT;
+
+ } else if (count || has_cache) {
+
+ if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ count += usage;
+ else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+ err = -EINVAL;
+ else if (swap_count_continued(p, offset, count))
+ count = COUNT_CONTINUED;
+ else
+ err = -ENOMEM;
+ } else
+ err = -ENOENT; /* unused swap entry */
+
+ p->swap_map[offset] = count | has_cache;
+
+unlock_out:
+ spin_unlock(&p->lock);
out:
- return result;
+ return err;
bad_file:
- printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+ pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
}
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
{
- return &swap_info[type];
+ __swap_duplicate(entry, SWAP_MAP_SHMEM);
}
/*
- * swap_lock prevents swap_map being freed. Don't grab an extra
- * reference on the swaphandle, it doesn't matter if it becomes unused.
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated. Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
*/
-int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
+int swap_duplicate(swp_entry_t entry)
{
- int ret = 0, i = 1 << page_cluster;
- unsigned long toff;
- struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
+ int err = 0;
- if (!page_cluster) /* no readahead */
- return 0;
- toff = (swp_offset(entry) >> page_cluster) << page_cluster;
- if (!toff) /* first page is swap header */
- toff++, i--;
- *offset = toff;
+ while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+ err = add_swap_count_continuation(entry, GFP_ATOMIC);
+ return err;
+}
- spin_lock(&swap_lock);
- do {
- /* Don't read-ahead past the end of the swap area */
- if (toff >= swapdev->max)
- break;
- /* Don't read in free or bad pages */
- if (!swapdev->swap_map[toff])
- break;
- if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
- break;
- toff++;
- ret++;
- } while (--i);
- spin_unlock(&swap_lock);
- return ret;
+/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
+ * Called when allocating swap cache for an existing swap entry.
+ * This can return error codes. Returns 0 on success.
+ * -EEXIST means there is already a swap cache.
+ * Note: return code is different from swap_duplicate().
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+ return __swap_duplicate(entry, SWAP_HAS_CACHE);
+}
+
+struct swap_info_struct *page_swap_info(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ BUG_ON(!PageSwapCache(page));
+ return swap_info[swp_type(swap)];
+}
+
+/*
+ * out-of-line __page_file_ methods to avoid include hell.
+ */
+struct address_space *__page_file_mapping(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
+
+pgoff_t __page_file_index(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ return swp_offset(swap);
+}
+EXPORT_SYMBOL_GPL(__page_file_index);
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+ struct swap_info_struct *si;
+ struct page *head;
+ struct page *page;
+ struct page *list_page;
+ pgoff_t offset;
+ unsigned char count;
+
+ /*
+ * When debugging, it's easier to use __GFP_ZERO here; but it's better
+ * for latency not to zero a page while GFP_ATOMIC and holding locks.
+ */
+ page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+ si = swap_info_get(entry);
+ if (!si) {
+ /*
+ * An acceptable race has occurred since the failing
+ * __swap_duplicate(): the swap entry has been freed,
+ * perhaps even the whole swap_map cleared for swapoff.
+ */
+ goto outer;
+ }
+
+ offset = swp_offset(entry);
+ count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+ if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+ /*
+ * The higher the swap count, the more likely it is that tasks
+ * will race to add swap count continuation: we need to avoid
+ * over-provisioning.
+ */
+ goto out;
+ }
+
+ if (!page) {
+ spin_unlock(&si->lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+ * no architecture is using highmem pages for kernel page tables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+ */
+ head = vmalloc_to_page(si->swap_map + offset);
+ offset &= ~PAGE_MASK;
+
+ /*
+ * Page allocation does not initialize the page's lru field,
+ * but it does always reset its private field.
+ */
+ if (!page_private(head)) {
+ BUG_ON(count & COUNT_CONTINUED);
+ INIT_LIST_HEAD(&head->lru);
+ set_page_private(head, SWP_CONTINUED);
+ si->flags |= SWP_CONTINUED;
+ }
+
+ list_for_each_entry(list_page, &head->lru, lru) {
+ unsigned char *map;
+
+ /*
+ * If the previous map said no continuation, but we've found
+ * a continuation page, free our allocation and use this one.
+ */
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ map = kmap_atomic(list_page) + offset;
+ count = *map;
+ kunmap_atomic(map);
+
+ /*
+ * If this continuation count now has some space in it,
+ * free our allocation and use this one.
+ */
+ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+ goto out;
+ }
+
+ list_add_tail(&page->lru, &head->lru);
+ page = NULL; /* now it's attached, don't free it */
+out:
+ spin_unlock(&si->lock);
+outer:
+ if (page)
+ __free_page(page);
+ return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds si->lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+ pgoff_t offset, unsigned char count)
+{
+ struct page *head;
+ struct page *page;
+ unsigned char *map;
+
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head) != SWP_CONTINUED) {
+ BUG_ON(count & COUNT_CONTINUED);
+ return false; /* need to add count continuation */
+ }
+
+ offset &= ~PAGE_MASK;
+ page = list_entry(head->lru.next, struct page, lru);
+ map = kmap_atomic(page) + offset;
+
+ if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
+ goto init_map; /* jump over SWAP_CONT_MAX checks */
+
+ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+ /*
+ * Think of how you add 1 to 999
+ */
+ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+ kunmap_atomic(map);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page) + offset;
+ }
+ if (*map == SWAP_CONT_MAX) {
+ kunmap_atomic(map);
+ page = list_entry(page->lru.next, struct page, lru);
+ if (page == head)
+ return false; /* add count continuation */
+ map = kmap_atomic(page) + offset;
+init_map: *map = 0; /* we didn't zero the page */
+ }
+ *map += 1;
+ kunmap_atomic(map);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page) + offset;
+ *map = COUNT_CONTINUED;
+ kunmap_atomic(map);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return true; /* incremented */
+
+ } else { /* decrementing */
+ /*
+ * Think of how you subtract 1 from 1000
+ */
+ BUG_ON(count != COUNT_CONTINUED);
+ while (*map == COUNT_CONTINUED) {
+ kunmap_atomic(map);
+ page = list_entry(page->lru.next, struct page, lru);
+ BUG_ON(page == head);
+ map = kmap_atomic(page) + offset;
+ }
+ BUG_ON(*map == 0);
+ *map -= 1;
+ if (*map == 0)
+ count = 0;
+ kunmap_atomic(map);
+ page = list_entry(page->lru.prev, struct page, lru);
+ while (page != head) {
+ map = kmap_atomic(page) + offset;
+ *map = SWAP_CONT_MAX | count;
+ count = COUNT_CONTINUED;
+ kunmap_atomic(map);
+ page = list_entry(page->lru.prev, struct page, lru);
+ }
+ return count == COUNT_CONTINUED;
+ }
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+ pgoff_t offset;
+
+ for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+ struct page *head;
+ head = vmalloc_to_page(si->swap_map + offset);
+ if (page_private(head)) {
+ struct list_head *this, *next;
+ list_for_each_safe(this, next, &head->lru) {
+ struct page *page;
+ page = list_entry(this, struct page, lru);
+ list_del(this);
+ __free_page(page);
+ }
+ }
+ }
}
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index f4c560b4a2b..00000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * mm/thrash.c
- *
- * Copyright (C) 2004, Red Hat, Inc.
- * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
- * Released under the GPL, see the file COPYING for details.
- *
- * Simple token based thrashing protection, using the algorithm
- * described in: http://www.cs.wm.edu/~sjiang/token.pdf
- */
-#include <linux/jiffies.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-
-static DEFINE_SPINLOCK(swap_token_lock);
-static unsigned long swap_token_timeout;
-static unsigned long swap_token_check;
-struct mm_struct * swap_token_mm = &init_mm;
-
-#define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2)
-#define SWAP_TOKEN_TIMEOUT (300 * HZ)
-/*
- * Currently disabled; Needs further code to work at HZ * 300.
- */
-unsigned long swap_token_default_timeout = SWAP_TOKEN_TIMEOUT;
-
-/*
- * Take the token away if the process had no page faults
- * in the last interval, or if it has held the token for
- * too long.
- */
-#define SWAP_TOKEN_ENOUGH_RSS 1
-#define SWAP_TOKEN_TIMED_OUT 2
-static int should_release_swap_token(struct mm_struct *mm)
-{
- int ret = 0;
- if (!mm->recent_pagein)
- ret = SWAP_TOKEN_ENOUGH_RSS;
- else if (time_after(jiffies, swap_token_timeout))
- ret = SWAP_TOKEN_TIMED_OUT;
- mm->recent_pagein = 0;
- return ret;
-}
-
-/*
- * Try to grab the swapout protection token. We only try to
- * grab it once every TOKEN_CHECK_INTERVAL, both to prevent
- * SMP lock contention and to check that the process that held
- * the token before is no longer thrashing.
- */
-void grab_swap_token(void)
-{
- struct mm_struct *mm;
- int reason;
-
- /* We have the token. Let others know we still need it. */
- if (has_swap_token(current->mm)) {
- current->mm->recent_pagein = 1;
- if (unlikely(!swap_token_default_timeout))
- disable_swap_token();
- return;
- }
-
- if (time_after(jiffies, swap_token_check)) {
-
- if (!swap_token_default_timeout) {
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- return;
- }
-
- /* ... or if we recently held the token. */
- if (time_before(jiffies, current->mm->swap_token_time))
- return;
-
- if (!spin_trylock(&swap_token_lock))
- return;
-
- swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
-
- mm = swap_token_mm;
- if ((reason = should_release_swap_token(mm))) {
- unsigned long eligible = jiffies;
- if (reason == SWAP_TOKEN_TIMED_OUT) {
- eligible += swap_token_default_timeout;
- }
- mm->swap_token_time = eligible;
- swap_token_timeout = jiffies + swap_token_default_timeout;
- swap_token_mm = current->mm;
- }
- spin_unlock(&swap_token_lock);
- }
- return;
-}
-
-/* Called on process exit. */
-void __put_swap_token(struct mm_struct *mm)
-{
- spin_lock(&swap_token_lock);
- if (likely(mm == swap_token_mm)) {
- mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
- swap_token_mm = &init_mm;
- swap_token_check = jiffies;
- }
- spin_unlock(&swap_token_lock);
-}
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 5f2cbf0f153..00000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
- *
- * Matt Mackall <mpm@selenic.com> January, 2004
- * derived from mm/shmem.c and fs/ramfs/inode.c
- *
- * This is intended for small system where the benefits of the full
- * shmem code (swap-backed and resource-limited) are outweighed by
- * their complexity. On systems without swap this code should be
- * effectively equivalent, but much lighter weight.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/vfs.h>
-#include <linux/mount.h>
-#include <linux/file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/swap.h>
-#include <linux/ramfs.h>
-
-static struct file_system_type tmpfs_fs_type = {
- .name = "tmpfs",
- .get_sb = ramfs_get_sb,
- .kill_sb = kill_litter_super,
-};
-
-static struct vfsmount *shm_mnt;
-
-static int __init init_tmpfs(void)
-{
- BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
-
- shm_mnt = kern_mount(&tmpfs_fs_type);
- BUG_ON(IS_ERR(shm_mnt));
-
- return 0;
-}
-module_init(init_tmpfs)
-
-/*
- * shmem_file_setup - get an unlinked file living in tmpfs
- *
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- *
- */
-struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
-{
- int error;
- struct file *file;
- struct inode *inode;
- struct dentry *dentry, *root;
- struct qstr this;
-
- if (IS_ERR(shm_mnt))
- return (void *)shm_mnt;
-
- error = -ENOMEM;
- this.name = name;
- this.len = strlen(name);
- this.hash = 0; /* will go */
- root = shm_mnt->mnt_root;
- dentry = d_alloc(root, &this);
- if (!dentry)
- goto put_memory;
-
- error = -ENFILE;
- file = get_empty_filp();
- if (!file)
- goto put_dentry;
-
- error = -ENOSPC;
- inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
- if (!inode)
- goto close_file;
-
- d_instantiate(dentry, inode);
- inode->i_nlink = 0; /* It is unlinked */
-
- file->f_vfsmnt = mntget(shm_mnt);
- file->f_dentry = dentry;
- file->f_mapping = inode->i_mapping;
- file->f_op = &ramfs_file_operations;
- file->f_mode = FMODE_WRITE | FMODE_READ;
-
- /* notify everyone as to the change of file size */
- error = do_truncate(dentry, size, 0, file);
- if (error < 0)
- goto close_file;
-
- return file;
-
-close_file:
- put_filp(file);
-put_dentry:
- dput(dentry);
-put_memory:
- return ERR_PTR(error);
-}
-
-/*
- * shmem_zero_setup - setup a shared anonymous mapping
- *
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
-{
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start;
-
- file = shmem_file_setup("dev/zero", size, vma->vm_flags);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- if (vma->vm_file)
- fput(vma->vm_file);
- vma->vm_file = file;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-int shmem_unuse(swp_entry_t entry, struct page *page)
-{
- return 0;
-}
-
-int shmem_mmap(struct file *file, struct vm_area_struct *vma)
-{
- file_accessed(file);
-#ifndef CONFIG_MMU
- return ramfs_nommu_mmap(file, vma);
-#else
- return 0;
-#endif
-}
-
-#ifndef CONFIG_MMU
-unsigned long shmem_get_unmapped_area(struct file *file,
- unsigned long addr,
- unsigned long len,
- unsigned long pgoff,
- unsigned long flags)
-{
- return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
-}
-#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index a654928323d..eda24730716 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -3,55 +3,151 @@
*
* Copyright (C) 2002, Linus Torvalds
*
- * 10Sep2002 akpm@zip.com.au
+ * 10Sep2002 Andrew Morton
* Initial version.
*/
#include <linux/kernel.h>
+#include <linux/backing-dev.h>
+#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/pagemap.h>
+#include <linux/highmem.h>
#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
+#include <linux/cleancache.h>
+#include "internal.h"
+static void clear_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ struct radix_tree_node *node;
+ void **slot;
+
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrshadows--;
+ if (!node)
+ goto unlock;
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes, &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
+ spin_unlock_irq(&mapping->tree_lock);
+}
+
+/**
+ * do_invalidatepage - invalidate part or all of a page
+ * @page: the page which is affected
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
+ *
+ * do_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * do_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point. Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void do_invalidatepage(struct page *page, unsigned int offset,
+ unsigned int length)
+{
+ void (*invalidatepage)(struct page *, unsigned int, unsigned int);
+
+ invalidatepage = page->mapping->a_ops->invalidatepage;
+#ifdef CONFIG_BLOCK
+ if (!invalidatepage)
+ invalidatepage = block_invalidatepage;
+#endif
+ if (invalidatepage)
+ (*invalidatepage)(page, offset, length);
+}
-static inline void truncate_partial_page(struct page *page, unsigned partial)
+/*
+ * This cancels just the dirty bit on the kernel page itself, it
+ * does NOT actually remove dirty bits on any mmap's that may be
+ * around. It also leaves the page tagged dirty, so any sync
+ * activity will still find it on the dirty lists, and in particular,
+ * clear_page_dirty_for_io() will still look at the dirty bits in
+ * the VM.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page, unsigned int account_size)
{
- memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
- if (PagePrivate(page))
- do_invalidatepage(page, partial);
+ if (TestClearPageDirty(page)) {
+ struct address_space *mapping = page->mapping;
+ if (mapping && mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ if (account_size)
+ task_io_account_cancelled_write(account_size);
+ }
+ }
}
+EXPORT_SYMBOL(cancel_dirty_page);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
- * becomes anonymous. It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
+ * becomes orphaned. It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_fault().
*
* We need to bale out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
- * its lock, b) when a concurrent invalidate_inode_pages got there first and
+ * its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
+static int
truncate_complete_page(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
- return;
+ return -EIO;
- if (PagePrivate(page))
- do_invalidatepage(page, 0);
+ if (page_has_private(page))
+ do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
- clear_page_dirty(page);
- ClearPageUptodate(page);
ClearPageMappedToDisk(page);
- remove_from_page_cache(page);
- page_cache_release(page); /* pagecache ref */
+ delete_from_page_cache(page);
+ return 0;
}
/*
- * This is for invalidate_inode_pages(). That function can be called at
+ * This is for invalidate_mapping_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
* be marked dirty at any time too, so use remove_mapping which safely
* discards clean, unused pages.
@@ -66,25 +162,68 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return 0;
- if (PagePrivate(page) && !try_to_release_page(page, 0))
+ if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
ret = remove_mapping(mapping, page);
- ClearPageUptodate(page);
return ret;
}
+int truncate_inode_page(struct address_space *mapping, struct page *page)
+{
+ if (page_mapped(page)) {
+ unmap_mapping_range(mapping,
+ (loff_t)page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ }
+ return truncate_complete_page(mapping, page);
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+ if (!mapping)
+ return -EINVAL;
+ /*
+ * Only punch for normal data pages for now.
+ * Handling other types like directories would need more auditing.
+ */
+ if (!S_ISREG(mapping->host->i_mode))
+ return -EIO;
+ return truncate_inode_page(mapping, page);
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages. The page must be locked.
+ *
+ * Returns 1 if the page is successfully invalidated, otherwise 0.
+ */
+int invalidate_inode_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+ if (PageDirty(page) || PageWriteback(page))
+ return 0;
+ if (page_mapped(page))
+ return 0;
+ return invalidate_complete_page(mapping, page);
+}
+
/**
- * truncate_inode_pages - truncate range of pages specified by start and
- * end byte offsets
+ * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
- * @lend: offset to which to truncate
+ * @lend: offset to which to truncate (inclusive)
*
* Truncate the page cache, removing the pages that are between
- * specified offsets (and zeroing out partial page
- * (if lstart is not page aligned)).
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
@@ -92,96 +231,173 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
- * When looking at page->index outside the page lock we need to be careful to
- * copy it into a local to avoid races (it could change at any time).
- *
* We pass down the cache-hot hint to the page freeing code. Even if the
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- pgoff_t end;
- const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
- struct pagevec pvec;
- pgoff_t next;
- int i;
-
- if (mapping->nrpages == 0)
+ pgoff_t start; /* inclusive */
+ pgoff_t end; /* exclusive */
+ unsigned int partial_start; /* inclusive */
+ unsigned int partial_end; /* exclusive */
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index;
+ int i;
+
+ cleancache_invalidate_inode(mapping);
+ if (mapping->nrpages == 0 && mapping->nrshadows == 0)
return;
- BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
- end = (lend >> PAGE_CACHE_SHIFT);
+ /* Offsets within partial pages */
+ partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * 'start' and 'end' always cover the range of pages to be fully
+ * truncated. Partial pages are covered with 'partial_start' at the
+ * start of the range and 'partial_end' at the end of the range.
+ * Note that 'end' is exclusive while 'lend' is inclusive.
+ */
+ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (lend == -1)
+ /*
+ * lend == -1 indicates end-of-file so we have to set 'end'
+ * to the highest possible pgoff_t and since the type is
+ * unsigned we're using -1.
+ */
+ end = -1;
+ else
+ end = (lend + 1) >> PAGE_CACHE_SHIFT;
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ index = start;
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index = page->index;
- if (page_index > end) {
- next = page_index;
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end)
break;
+
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
}
- if (page_index > next)
- next = page_index;
- next++;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
continue;
+ WARN_ON(page->index != index);
if (PageWriteback(page)) {
unlock_page(page);
continue;
}
- truncate_complete_page(mapping, page);
+ truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
- if (partial) {
+ if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
+ unsigned int top = PAGE_CACHE_SIZE;
+ if (start > end) {
+ /* Truncation within a single page */
+ top = partial_end;
+ partial_end = 0;
+ }
wait_on_page_writeback(page);
- truncate_partial_page(page, partial);
+ zero_user_segment(page, partial_start, top);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, partial_start,
+ top - partial_start);
unlock_page(page);
page_cache_release(page);
}
}
+ if (partial_end) {
+ struct page *page = find_lock_page(mapping, end);
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user_segment(page, 0, partial_end);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, 0,
+ partial_end);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ /*
+ * If the truncation happened within a single page no pages
+ * will be released, just zeroed, so we can bail out now.
+ */
+ if (start >= end)
+ return;
- next = start;
+ index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
- if (next == start)
+ if (!pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+ /* If all gone from start onwards, we're done */
+ if (index == start)
break;
- next = start;
+ /* Otherwise restart to make sure all gone */
+ index = start;
continue;
}
- if (pvec.pages[0]->index > end) {
+ if (index == start && indices[0] >= end) {
+ /* All gone out of hole to be punched, we're done */
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
break;
}
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end) {
+ /* Restart punch to make sure all gone */
+ index = start - 1;
break;
+ }
+
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
lock_page(page);
+ WARN_ON(page->index != index);
wait_on_page_writeback(page);
- if (page->index > next)
- next = page->index;
- next++;
- truncate_complete_page(mapping, page);
+ truncate_inode_page(mapping, page);
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ index++;
}
+ cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -191,6 +407,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* @lstart: offset from which to truncate
*
* Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
@@ -199,6 +420,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
EXPORT_SYMBOL(truncate_inode_pages);
/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_mutex.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ *
+ * The mapping is first marked as exiting. truncate_inode_pages() is then
+ * called from offset 0, but only when the mapping still reports pages or
+ * shadow entries; an already empty mapping is not walked at all.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+ unsigned long nrshadows;
+ unsigned long nrpages;
+
+ /*
+ * Page reclaim can not participate in regular inode lifetime
+ * management (can't call iput()) and thus can race with the
+ * inode teardown. Tell it when the address space is exiting,
+ * so that it does not install eviction information after the
+ * final truncate has begun.
+ */
+ mapping_set_exiting(mapping);
+
+ /*
+ * When reclaim installs eviction entries, it increases
+ * nrshadows first, then decreases nrpages. Make sure we see
+ * this in the right order or we might miss an entry.
+ */
+ nrpages = mapping->nrpages;
+ smp_rmb(); /* keeps the nrpages read ordered before the nrshadows read */
+ nrshadows = mapping->nrshadows;
+
+ if (nrpages || nrshadows) {
+ /*
+ * As truncation uses a lockless tree lookup, cycle
+ * the tree lock to make sure any ongoing tree
+ * modification that does not see AS_EXITING is
+ * completed before starting the final truncate.
+ */
+ spin_lock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ truncate_inode_pages(mapping, 0); /* offset 0: drop the whole mapping */
+ }
+}
+EXPORT_SYMBOL(truncate_inode_pages_final);
+
+/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
@@ -212,57 +480,99 @@ EXPORT_SYMBOL(truncate_inode_pages);
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+ pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
- pgoff_t next = start;
- unsigned long ret = 0;
+ pgoff_t index = start;
+ unsigned long ret;
+ unsigned long count = 0;
int i;
pagevec_init(&pvec, 0);
- while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t index;
- int lock_failed;
- lock_failed = TestSetPageLocked(page);
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index > end)
+ break;
- /*
- * We really shouldn't be looking at the ->index of an
- * unlocked page. But we're not allowed to lock these
- * pages. So we rely upon nobody altering the ->index
- * of this (pinned-by-us) page.
- */
- index = page->index;
- if (index > next)
- next = index;
- next++;
- if (lock_failed)
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
continue;
+ }
- if (PageDirty(page) || PageWriteback(page))
- goto unlock;
- if (page_mapped(page))
- goto unlock;
- ret += invalidate_complete_page(mapping, page);
-unlock:
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
+ ret = invalidate_inode_page(page);
unlock_page(page);
- if (next > end)
- break;
+ /*
+ * Invalidation is a hint that the page is no longer
+ * of interest and try to speed up its reclaim.
+ */
+ if (!ret)
+ deactivate_page(page);
+ count += ret;
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
+ cond_resched();
+ index++;
}
- return ret;
+ return count;
}
+EXPORT_SYMBOL(invalidate_mapping_pages);
-unsigned long invalidate_inode_pages(struct address_space *mapping)
+/*
+ * This is like invalidate_complete_page(), except it ignores the page's
+ * refcount. We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
+ */
+static int
+invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- return invalidate_mapping_pages(mapping, 0, ~0UL);
+ if (page->mapping != mapping)
+ return 0;
+
+ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
+ return 0;
+
+ spin_lock_irq(&mapping->tree_lock);
+ if (PageDirty(page))
+ goto failed;
+
+ BUG_ON(page_has_private(page));
+ __delete_from_page_cache(page, NULL);
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
+ page_cache_release(page); /* pagecache ref */
+ return 1;
+failed:
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
}
-EXPORT_SYMBOL(invalidate_inode_pages);
+static int do_launder_page(struct address_space *mapping, struct page *page)
+{ /* Call ->launder_page on @page if it is dirty and still belongs to @mapping. */
+ if (!PageDirty(page))
+ return 0; /* clean page: nothing to launder */
+ if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ return 0; /* page is no longer in @mapping, or no hook is provided */
+ return mapping->a_ops->launder_page(page); /* hook's result is passed through */
+}
/**
* invalidate_inode_pages2_range - remove range of pages from an address_space
@@ -273,51 +583,55 @@ EXPORT_SYMBOL(invalidate_inode_pages);
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+ pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t index;
int i;
int ret = 0;
+ int ret2 = 0;
int did_range_unmap = 0;
- int wrapped = 0;
+ cleancache_invalidate_inode(mapping);
pagevec_init(&pvec, 0);
- next = start;
- while (next <= end && !ret && !wrapped &&
- pagevec_lookup(&pvec, mapping, next,
- min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
- for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
+ index = start;
+ while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
+ indices)) {
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- pgoff_t page_index;
- int was_dirty;
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index > end)
+ break;
+
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
lock_page(page);
+ WARN_ON(page->index != index);
if (page->mapping != mapping) {
unlock_page(page);
continue;
}
- page_index = page->index;
- next = page_index + 1;
- if (next == 0)
- wrapped = 1;
- if (page_index > end) {
- unlock_page(page);
- break;
- }
wait_on_page_writeback(page);
- while (page_mapped(page)) {
+ if (page_mapped(page)) {
if (!did_range_unmap) {
/*
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- (loff_t)(end - page_index + 1)
- << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ (loff_t)(1 + end - index)
+ << PAGE_CACHE_SHIFT,
0);
did_range_unmap = 1;
} else {
@@ -325,21 +639,27 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Just zap this page
*/
unmap_mapping_range(mapping,
- (loff_t)page_index<<PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
}
}
- was_dirty = test_clear_page_dirty(page);
- if (!invalidate_complete_page(mapping, page)) {
- if (was_dirty)
- set_page_dirty(page);
- ret = -EIO;
+ BUG_ON(page_mapped(page));
+ ret2 = do_launder_page(mapping, page);
+ if (ret2 == 0) {
+ if (!invalidate_complete_page2(mapping, page))
+ ret2 = -EBUSY;
}
+ if (ret2 < 0)
+ ret = ret2;
unlock_page(page);
}
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
cond_resched();
+ index++;
}
+ cleancache_invalidate_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -351,10 +671,102 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ *
+ * Page cache is truncated from @newsize, while user mappings are unmapped
+ * from @newsize rounded up to a page boundary. Both unmap calls pass a
+ * hole length of 0 and a non-zero final ("even_cows") argument.
+ */
+void truncate_pagecache(struct inode *inode, loff_t newsize)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t holebegin = round_up(newsize, PAGE_SIZE); /* first page boundary at or after @newsize */
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+ truncate_inode_pages(mapping, newsize);
+ unmap_mapping_range(mapping, holebegin, 0, 1);
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
+ *
+ * Must be called with inode_mutex held and before all filesystem specific
+ * block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+ i_size_write(inode, newsize); /* truncate_pagecache() requires i_size to be written first */
+ truncate_pagecache(inode, newsize);
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ *
+ * The unmap is skipped when the hole, contracted inwards to whole pages,
+ * is empty; truncate_inode_pages_range() is always called with the
+ * original, unrounded @lstart and @lend.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t unmap_start = round_up(lstart, PAGE_SIZE); /* first page boundary at or after @lstart */
+ loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* last byte before the page boundary at or below @lend + 1 */
+ /*
+ * This rounding is currently just for example: unmap_mapping_range
+ * expands its hole outwards, whereas we want it to contract the hole
+ * inwards. However, existing callers of truncate_pagecache_range are
+ * doing their own page rounding first. Note that unmap_mapping_range
+ * allows holelen 0 for all, and we allow lend -1 for end of file.
+ */
+
+ /*
+ * Unlike in truncate_pagecache, unmap_mapping_range is called only
+ * once (before truncating pagecache), and without "even_cows" flag:
+ * hole-punching should not remove private COWed pages from the hole.
+ */
+ if ((u64)unmap_end > (u64)unmap_start) /* unsigned compare: lend == -1 yields unmap_end == -1, the largest u64 */
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_pagecache_range);
diff --git a/mm/util.c b/mm/util.c
index 7368479220b..d5ea733c508 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,26 +1,26 @@
+#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/mman.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+
#include <asm/uaccess.h>
-/**
- * __kzalloc - allocate memory. The memory is set to zero.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- */
-void *__kzalloc(size_t size, gfp_t flags)
-{
- void *ret = ____kmalloc(size, flags);
- if (ret)
- memset(ret, 0, size);
- return ret;
-}
-EXPORT_SYMBOL(__kzalloc);
+#include "internal.h"
-/*
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
+/**
* kstrdup - allocate space for and copy an existing string
- *
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
@@ -33,16 +33,178 @@ char *kstrdup(const char *s, gfp_t gfp)
return NULL;
len = strlen(s) + 1;
- buf = ____kmalloc(len, gfp);
+ buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
+/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns the newly allocated, always NUL-terminated copy, or NULL when
+ * @s is NULL or the allocation fails.
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+ size_t len;
+ char *buf;
+
+ if (!s)
+ return NULL;
+
+ len = strnlen(s, max); /* never more than @max, even if @s is longer */
+ buf = kmalloc_track_caller(len+1, gfp); /* +1 for the terminator written below */
+ if (buf) {
+ memcpy(buf, s, len);
+ buf[len] = '\0';
+ }
+ return buf;
+}
+EXPORT_SYMBOL(kstrndup);
+
+/**
+ * kmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Returns the newly allocated copy of the @len bytes at @src, or the
+ * allocator's failure result unchanged when no memory could be obtained.
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kmalloc_track_caller(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kmemdup);
+
+/**
+ * memdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure: -ENOMEM when the allocation fails,
+ * -EFAULT when copy_from_user() reports a failure (the buffer is freed
+ * again in that case). It never returns NULL; check the result with
+ * IS_ERR().
+ */
+void *memdup_user(const void __user *src, size_t len)
+{
+ void *p;
+
+ /*
+ * Always use GFP_KERNEL, since copy_from_user() can sleep and
+ * cause pagefault, which makes it pointless to use GFP_NOFS
+ * or GFP_ATOMIC.
+ */
+ p = kmalloc_track_caller(len, GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kfree(p); /* do not leak the partially filled buffer */
+ return ERR_PTR(-EFAULT);
+ }
+
+ return p;
+}
+EXPORT_SYMBOL(memdup_user);
+
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ gfp_t flags)
+{ /* Shared core of __krealloc() and krealloc(); never frees @p itself. */
+ void *ret;
+ size_t ks = 0; /* stays 0 when @p is NULL */
+
+ if (p)
+ ks = ksize(p);
+
+ if (ks >= new_size)
+ return (void *)p; /* existing buffer is already large enough: reuse it */
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret && p)
+ memcpy(ret, p, ks); /* ks < new_size here, so the copy fits in the new buffer */
+
+ return ret;
+}
+
+/**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
+ *
+ * Returns ZERO_SIZE_PTR when @new_size is 0; otherwise whatever
+ * __do_krealloc() returns, which may be @p itself when the existing
+ * buffer is already large enough.
+ */
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ if (unlikely(!new_size))
+ return ZERO_SIZE_PTR;
+
+ return __do_krealloc(p, new_size, flags);
+
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ *
+ * Returns ZERO_SIZE_PTR when @new_size is 0. When the allocation of a
+ * larger buffer fails, the failure result is returned and @p is left
+ * allocated and untouched.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+ void *ret;
+
+ if (unlikely(!new_size)) {
+ kfree(p);
+ return ZERO_SIZE_PTR;
+ }
+
+ ret = __do_krealloc(p, new_size, flags);
+ if (ret && p != ret)
+ kfree(p); /* contents moved to a new buffer: release the old one */
+
+ return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kzfree() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance sensitive code.
+ */
+void kzfree(const void *p)
+{
+ size_t ks;
+ void *mem = (void *)p; /* drop const so the buffer can be cleared */
+
+ if (unlikely(ZERO_OR_NULL_PTR(mem)))
+ return;
+ ks = ksize(mem); /* size reported by ksize(), not the originally requested size */
+ memset(mem, 0, ks);
+ kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
+
/*
* strndup_user - duplicate an existing string from user space
- *
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*/
@@ -59,18 +221,294 @@ char *strndup_user(const char __user *s, long n)
if (length > n)
return ERR_PTR(-EINVAL);
- p = kmalloc(length, GFP_KERNEL);
+ p = memdup_user(s, length);
- if (!p)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(p, s, length)) {
- kfree(p);
- return ERR_PTR(-EFAULT);
- }
+ if (IS_ERR(p))
+ return p;
p[length - 1] = '\0';
return p;
}
EXPORT_SYMBOL(strndup_user);
+
+void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct rb_node *rb_parent)
+{ /* Link @vma into the vm_prev/vm_next list after @prev, or at the head if @prev is NULL. */
+ struct vm_area_struct *next; /* the vma that will follow @vma, if any */
+
+ vma->vm_prev = prev;
+ if (prev) {
+ next = prev->vm_next;
+ prev->vm_next = vma;
+ } else {
+ mm->mmap = vma; /* no predecessor: @vma becomes the list head */
+ if (rb_parent)
+ next = rb_entry(rb_parent,
+ struct vm_area_struct, vm_rb); /* vma embedding @rb_parent is used as the successor */
+ else
+ next = NULL;
+ }
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
+}
+
+/*
+ * Check if the vma is being used as a stack by this task, i.e. whether
+ * KSTK_ESP(t) lies within [vma->vm_start, vma->vm_end], both ends included.
+ */
+static int vm_is_stack_for_task(struct task_struct *t,
+ struct vm_area_struct *vma)
+{
+ return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
+}
+
+/*
+ * Check if the vma is being used as a stack.
+ * If in_group is non-zero, check in the entire thread group or else
+ * just check in the current task. Returns the pid of the task that
+ * the vma is stack for, or 0 when no matching task is found.
+ */
+pid_t vm_is_stack(struct task_struct *task,
+ struct vm_area_struct *vma, int in_group)
+{
+ pid_t ret = 0;
+
+ if (vm_is_stack_for_task(task, vma))
+ return task->pid; /* @task itself matches: no need to walk the group */
+
+ if (in_group) {
+ struct task_struct *t;
+ rcu_read_lock(); /* held across the thread walk below */
+ if (!pid_alive(task))
+ goto done;
+
+ t = task;
+ do {
+ if (vm_is_stack_for_task(t, vma)) {
+ ret = t->pid;
+ goto done;
+ }
+ } while_each_thread(task, t);
+done:
+ rcu_read_unlock();
+ }
+
+ return ret;
+}
+
+#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{ /* Fallback used when the architecture does not define HAVE_ARCH_PICK_MMAP_LAYOUT. */
+ mm->mmap_base = TASK_UNMAPPED_BASE;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+}
+#endif /* CONFIG_MMU && !HAVE_ARCH_PICK_MMAP_LAYOUT */
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture does not support this function, simply return with
+ * no pages pinned: this __weak default always returns 0.
+ */
+int __weak __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ *
+ * get_user_pages_fast provides equivalent functionality to get_user_pages,
+ * operating on current and current->mm, with force=0 and vma=NULL. However
+ * unlike get_user_pages, it must be called without mmap_sem held.
+ *
+ * get_user_pages_fast may take mmap_sem and page table locks, so no
+ * assumptions can be made about lack of locking. get_user_pages_fast is to be
+ * implemented in a way that is advantageous (vs get_user_pages()) when the
+ * user memory area is already faulted in and present in ptes. However if the
+ * pages have to be faulted in, it may turn out to be slightly slower so
+ * callers need to carefully consider what to use. On many architectures,
+ * get_user_pages_fast simply falls back to get_user_pages.
+ *
+ * This __weak default is that fallback: it takes current->mm's mmap_sem
+ * for reading around a plain get_user_pages() call.
+ */
+int __weak get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start, nr_pages,
+ write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff)
+{ /* security_mmap_file() check, then do_mmap_pgoff() on current->mm under mmap_sem held for write. */
+ unsigned long ret;
+ struct mm_struct *mm = current->mm;
+ unsigned long populate; /* NOTE(review): only written by do_mmap_pgoff(); confirm it is set on every path, it is read unconditionally below */
+
+ ret = security_mmap_file(file, prot, flag);
+ if (!ret) {
+ down_write(&mm->mmap_sem);
+ ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
+ &populate);
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(ret, populate); /* runs after mmap_sem has been released */
+ }
+ return ret; /* non-zero security_mmap_file() result, or do_mmap_pgoff()'s result */
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long offset)
+{ /* Byte-offset front end for vm_mmap_pgoff(); -EINVAL is returned in the unsigned long result. */
+ if (unlikely(offset + PAGE_ALIGN(len) < offset))
+ return -EINVAL; /* offset + page-aligned length wrapped around */
+ if (unlikely(offset & ~PAGE_MASK))
+ return -EINVAL; /* offset is not page aligned */
+
+ return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); /* bytes -> pages */
+}
+EXPORT_SYMBOL(vm_mmap);
+
+void kvfree(const void *addr)
+{ /* Free @addr with vfree() if it is a vmalloc address, with kfree() otherwise. */
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+struct address_space *page_mapping(struct page *page)
+{ /* Return the address_space backing @page, or NULL if there is none (see cases below). */
+ struct address_space *mapping = page->mapping;
+
+ /* This happens if someone calls flush_dcache_page on slab page */
+ if (unlikely(PageSlab(page)))
+ return NULL;
+
+ if (unlikely(PageSwapCache(page))) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page); /* swap entry is rebuilt from page_private() */
+ mapping = swap_address_space(entry);
+ } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+ mapping = NULL; /* ->mapping carries the PAGE_MAPPING_ANON bit: not an address_space to hand out */
+ return mapping;
+}
+
+int overcommit_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{ /* proc_dointvec() wrapper: a successful write also zeroes sysctl_overcommit_kbytes. */
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_kbytes = 0; /* vm_commit_limit() uses the ratio only while kbytes is 0 */
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{ /* proc_doulongvec_minmax() wrapper: a successful write also zeroes sysctl_overcommit_ratio. */
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_ratio = 0; /* keep the two overcommit settings mutually exclusive */
+ return ret;
+}
+
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ *
+ * Returned in pages: either sysctl_overcommit_kbytes converted to pages
+ * (when non-zero), or sysctl_overcommit_ratio percent of totalram_pages
+ * minus hugetlb_total_pages(); total_swap_pages is added in both cases.
+ */
+unsigned long vm_commit_limit(void)
+{
+ unsigned long allowed;
+
+ if (sysctl_overcommit_kbytes)
+ allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); /* KiB -> pages */
+ else
+ allowed = ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100);
+ allowed += total_swap_pages;
+
+ return allowed;
+}
+
+/**
+ * get_cmdline() - copy the cmdline value to a buffer.
+ * @task: the task whose cmdline value to copy.
+ * @buffer: the buffer to copy to.
+ * @buflen: the length of the buffer. Larger cmdline values are truncated
+ * to this length.
+ * Returns the size of the cmdline field copied. Note that the copy does
+ * not guarantee an ending NUL byte. Returns 0 when @task has no mm or
+ * when mm->arg_end is still 0.
+ */
+int get_cmdline(struct task_struct *task, char *buffer, int buflen)
+{
+ int res = 0;
+ unsigned int len;
+ struct mm_struct *mm = get_task_mm(task); /* reference is dropped by mmput() at out_mm */
+ if (!mm)
+ goto out;
+ if (!mm->arg_end)
+ goto out_mm; /* Shh! No looking before we're done */
+
+ len = mm->arg_end - mm->arg_start;
+
+ if (len > buflen) /* NOTE(review): unsigned vs int compare; a negative @buflen would convert to a huge value - confirm callers never pass one */
+ len = buflen;
+
+ res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+
+ /*
+ * If the nul at the end of args has been overwritten, then
+ * assume application is using setproctitle(3).
+ */
+ if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
+ len = strnlen(buffer, res);
+ if (len < res) {
+ res = len; /* a NUL exists earlier in the data: cut there */
+ } else {
+ len = mm->env_end - mm->env_start;
+ if (len > buflen - res)
+ len = buflen - res; /* only the space left in @buffer */
+ res += access_process_vm(task, mm->env_start,
+ buffer+res, len, 0);
+ res = strnlen(buffer, res); /* stop at the first NUL, if any */
+ }
+ }
+out_mm:
+ mmput(mm);
+out:
+ return res;
+}
+
+/* Tracepoints definitions. */
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 00000000000..9f25af825de
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush vma caches for threads that share a given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * Single threaded tasks need not iterate the entire
+ * list of processes. We can avoid the flushing as well
+ * since the mm's seqnum was increased and we don't have
+ * to worry about other threads' seqnum. Current's
+ * flush will occur upon the next lookup.
+ */
+ if (atomic_read(&mm->mm_users) == 1)
+ return;
+
+ rcu_read_lock(); /* held across the whole process/thread walk */
+ for_each_process_thread(g, p) {
+ /*
+ * Only flush the vmacache pointers as the
+ * mm seqnum is already set and curr's will
+ * be set upon invalidation when the next
+ * lookup is done.
+ */
+ if (mm == p->mm)
+ vmacache_flush(p);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via use_mm().
+ * That kernel thread's vmacache is not applicable to this mm.
+ *
+ * Returns true only when @mm is current->mm and current does not have
+ * PF_KTHREAD set.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{ /* Store @newvma in current's cache slot VMACACHE_HASH(@addr). */
+ if (vmacache_valid_mm(newvma->vm_mm)) /* skipped when @newvma's mm fails vmacache_valid_mm() */
+ current->vmacache[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{ /* True if current's vmacache may be searched for @mm; a stale cache is flushed and reported invalid. */
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ /*
+ * First attempt will always be invalid, initialize
+ * the new cache for this task here.
+ */
+ curr->vmacache_seqnum = mm->vmacache_seqnum; /* resync so later lookups can pass */
+ vmacache_flush(curr);
+ return false;
+ }
+ return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{ /* Look up the cached vma containing @addr; NULL on a miss or when the cache is not valid for @mm. */
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS); /* counted only for lookups on a valid cache */
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (!vma)
+ continue;
+ if (WARN_ON_ONCE(vma->vm_mm != mm))
+ break; /* entry from another mm: stop scanning and report a miss */
+ if (vma->vm_start <= addr && vma->vm_end > addr) { /* vm_end is exclusive in this test */
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{ /* Like vmacache_find(), but the cached vma must span exactly [@start, @end). */
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ count_vm_vmacache_event(VMACACHE_FIND_CALLS);
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end) { /* both bounds must match exactly */
+ count_vm_vmacache_event(VMACACHE_FIND_HITS);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+#endif /* !CONFIG_MMU */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1ac191ce564..f64632b6719 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,24 +8,52 @@
* Numa awareness, Christoph Lameter, SGI, June 2005
*/
+#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
+#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
-
-#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugobjects.h>
+#include <linux/kallsyms.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
+#include <linux/pfn.h>
+#include <linux/kmemleak.h>
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/llist.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
+#include <asm/shmparam.h>
+struct vfree_deferred { /* per-cpu queue of addresses whose __vunmap() is deferred to free_work() */
+ struct llist_head list; /* queued nodes; free_work() drains it with llist_del_all() */
+ struct work_struct wq; /* work item from which free_work() recovers this struct */
+};
+static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
+static void __vunmap(const void *, int);
+
+static void free_work(struct work_struct *w)
+{ /* Drain the vfree_deferred list embedding @w and __vunmap() every queued address. */
+ struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+ struct llist_node *llnode = llist_del_all(&p->list); /* detach the whole list at once */
+ while (llnode) {
+ void *p = llnode; /* shadows the outer p; the node's own address is what gets unmapped */
+ llnode = llist_next(llnode); /* fetch the next node before the current one is handed to __vunmap() */
+ __vunmap(p, 1);
+ }
+}
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node);
+/*** Page table manipulation functions ***/
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
@@ -38,8 +66,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
} while (pte++, addr += PAGE_SIZE, addr != end);
}
-static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
@@ -53,8 +80,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
} while (pmd++, addr = next, addr != end);
}
-static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end)
+static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
@@ -68,46 +94,49 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
} while (pud++, addr = next, addr != end);
}
-void unmap_vm_area(struct vm_struct *area)
+static void vunmap_page_range(unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
- unsigned long addr = (unsigned long) area->addr;
- unsigned long end = addr + area->size;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
- flush_cache_vunmap(addr, end);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
vunmap_pud_range(pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range((unsigned long) area->addr, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page ***pages)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pte_t *pte;
+ /*
+ * nr is a running index into the array which helps higher level
+ * callers keep track of where we're up to.
+ */
+
pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
- struct page *page = **pages;
- WARN_ON(!pte_none(*pte));
- if (!page)
+ struct page *page = pages[*nr];
+
+ if (WARN_ON(!pte_none(*pte)))
+ return -EBUSY;
+ if (WARN_ON(!page))
return -ENOMEM;
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
- (*pages)++;
+ (*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
return 0;
}
-static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pmd_t *pmd;
unsigned long next;
@@ -117,14 +146,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages))
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
pud_t *pud;
unsigned long next;
@@ -134,211 +163,1305 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages))
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+/*
+ * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
+ * will have pfns corresponding to the "pages" array.
+ *
+ * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ */
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
- unsigned long addr = (unsigned long) area->addr;
- unsigned long end = addr + area->size - PAGE_SIZE;
- int err;
+ unsigned long addr = start;
+ int err = 0;
+ int nr = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_pud_range(pgd, addr, next, prot, pages);
+ err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
if (err)
- break;
+ return err;
} while (pgd++, addr = next, addr != end);
- flush_cache_vmap((unsigned long) area->addr, end);
- return err;
+
+ return nr;
}
-struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end, int node)
+static int vmap_page_range(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
- struct vm_struct **p, *tmp, *area;
- unsigned long align = 1;
- unsigned long addr;
+ int ret;
+
+ ret = vmap_page_range_noflush(start, end, prot, pages);
+ flush_cache_vmap(start, end);
+ return ret;
+}
- if (flags & VM_IOREMAP) {
- int bit = fls(size);
+/*
+ * Return nonzero when @x lies in the module area (on configurations that
+ * define one) or in the vmalloc area.
+ *
+ * ARM, x86-64 and sparc64 keep modules in a dedicated range and only fall
+ * back on vmalloc() when that fails; everybody else places modules in the
+ * vmalloc space directly.
+ */
+int is_vmalloc_or_module_addr(const void *x)
+{
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+	const unsigned long module_addr = (unsigned long)x;
+
+	if (MODULES_VADDR <= module_addr && module_addr < MODULES_END)
+		return 1;
+#endif
+	return is_vmalloc_addr(x);
+}
- if (bit > IOREMAP_MAX_ORDER)
- bit = IOREMAP_MAX_ORDER;
- else if (bit < PAGE_SHIFT)
- bit = PAGE_SHIFT;
+/*
+ * Walk a vmap address to the struct page it maps.
+ */
+struct page *vmalloc_to_page(const void *vmalloc_addr)
+{
+ unsigned long addr = (unsigned long) vmalloc_addr;
+ struct page *page = NULL;
+ pgd_t *pgd = pgd_offset_k(addr);
- align = 1ul << bit;
+ /*
+ * XXX we might need to change this if we add VIRTUAL_BUG_ON for
+ * architectures that do not vmalloc module space
+ */
+ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
+
+ if (!pgd_none(*pgd)) {
+ pud_t *pud = pud_offset(pgd, addr);
+ if (!pud_none(*pud)) {
+ pmd_t *pmd = pmd_offset(pud, addr);
+ if (!pmd_none(*pmd)) {
+ pte_t *ptep, pte;
+
+ ptep = pte_offset_map(pmd, addr);
+ pte = *ptep;
+ if (pte_present(pte))
+ page = pte_page(pte);
+ pte_unmap(ptep);
+ }
+ }
}
- addr = ALIGN(start, align);
- size = PAGE_ALIGN(size);
+ return page;
+}
+EXPORT_SYMBOL(vmalloc_to_page);
- area = kmalloc_node(sizeof(*area), GFP_KERNEL, node);
- if (unlikely(!area))
- return NULL;
+/*
+ * Translate a vmalloc()-space virtual address into its physical page frame
+ * number. The result of vmalloc_to_page() is passed on unchecked, so the
+ * address is expected to be mapped.
+ */
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+{
+	struct page *mapped = vmalloc_to_page(vmalloc_addr);
+
+	return page_to_pfn(mapped);
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
- if (unlikely(!size)) {
- kfree (area);
- return NULL;
+
+/*** Global kva allocator ***/
+
+#define VM_LAZY_FREE 0x01
+#define VM_LAZY_FREEING 0x02
+#define VM_VM_AREA 0x04
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+/* Export for kexec only */
+LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+
+/* The vmap cache globals are protected by vmap_area_lock */
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_vstart;
+static unsigned long cached_align;
+
+static unsigned long vmap_area_pcpu_hole;
+
+/*
+ * Look up the vmap_area whose [va_start, va_end) range contains @addr by
+ * descending vmap_area_root. Returns NULL when no area covers @addr.
+ */
+static struct vmap_area *__find_vmap_area(unsigned long addr)
+{
+	struct rb_node *node;
+
+	for (node = vmap_area_root.rb_node; node; ) {
+		struct vmap_area *candidate;
+
+		candidate = rb_entry(node, struct vmap_area, rb_node);
+		if (addr >= candidate->va_start && addr < candidate->va_end)
+			return candidate;
+
+		/* Not inside this area: go below its start or above its end. */
+		if (addr < candidate->va_start)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+static void __insert_vmap_area(struct vmap_area *va) /* link va into vmap_area_root and vmap_area_list */
+{
+ struct rb_node **p = &vmap_area_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct rb_node *tmp;
+
+ while (*p) { /* descend to the empty link where va belongs */
+ struct vmap_area *tmp_va;
+
+ parent = *p;
+ tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+ if (va->va_start < tmp_va->va_end)
+ p = &(*p)->rb_left;
+ else if (va->va_end > tmp_va->va_start)
+ p = &(*p)->rb_right;
+ else
+ BUG(); /* neither below nor above the existing area */
 }
+ rb_link_node(&va->rb_node, parent, p);
+ rb_insert_color(&va->rb_node, &vmap_area_root);
+
+ /*
+ * address-sort this list: va is placed right after its rbtree
+ * predecessor, or at the head of vmap_area_list when it has none.
+ */
+ tmp = rb_prev(&va->rb_node);
+ if (tmp) {
+ struct vmap_area *prev;
+ prev = rb_entry(tmp, struct vmap_area, rb_node);
+ list_add_rcu(&va->list, &prev->list);
+ } else
+ list_add_rcu(&va->list, &vmap_area_list);
+}
+
+static void purge_vmap_area_lazy(void);
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+ unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ int node, gfp_t gfp_mask)
+{
+ struct vmap_area *va;
+ struct rb_node *n;
+ unsigned long addr;
+ int purged = 0;
+ struct vmap_area *first;
+
+ BUG_ON(!size);
+ BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(!is_power_of_2(align));
+
+ va = kmalloc_node(sizeof(struct vmap_area),
+ gfp_mask & GFP_RECLAIM_MASK, node);
+ if (unlikely(!va))
+ return ERR_PTR(-ENOMEM);
+
/*
- * We always allocate a guard page.
+ * Only scan the relevant parts containing pointers to other objects
+ * to avoid false negatives.
*/
- size += PAGE_SIZE;
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
- if ((unsigned long)tmp->addr < addr) {
- if((unsigned long)tmp->addr + tmp->size >= addr)
- addr = ALIGN(tmp->size +
- (unsigned long)tmp->addr, align);
- continue;
+retry:
+ spin_lock(&vmap_area_lock);
+ /*
+ * Invalidate cache if we have more permissive parameters.
+ * cached_hole_size notes the largest hole noticed _below_
+ * the vmap_area cached in free_vmap_cache: if size fits
+ * into that hole, we want to scan from vstart to reuse
+ * the hole instead of allocating above free_vmap_cache.
+ * Note that __free_vmap_area may update free_vmap_cache
+ * without updating cached_hole_size or cached_align.
+ */
+ if (!free_vmap_cache ||
+ size < cached_hole_size ||
+ vstart < cached_vstart ||
+ align < cached_align) {
+nocache:
+ cached_hole_size = 0;
+ free_vmap_cache = NULL;
+ }
+ /* record if we encounter less permissive parameters */
+ cached_vstart = vstart;
+ cached_align = align;
+
+ /* find starting point for our search */
+ if (free_vmap_cache) {
+ first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ addr = ALIGN(first->va_end, align);
+ if (addr < vstart)
+ goto nocache;
+ if (addr + size < addr)
+ goto overflow;
+
+ } else {
+ addr = ALIGN(vstart, align);
+ if (addr + size < addr)
+ goto overflow;
+
+ n = vmap_area_root.rb_node;
+ first = NULL;
+
+ while (n) {
+ struct vmap_area *tmp;
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_end >= addr) {
+ first = tmp;
+ if (tmp->va_start <= addr)
+ break;
+ n = n->rb_left;
+ } else
+ n = n->rb_right;
}
- if ((size + addr) < addr)
- goto out;
- if (size + addr <= (unsigned long)tmp->addr)
+
+ if (!first)
goto found;
- addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
- if (addr > end - size)
- goto out;
}
-found:
- area->next = *p;
- *p = area;
+ /* from the starting point, walk areas until a suitable hole is found */
+ while (addr + size > first->va_start && addr + size <= vend) {
+ if (addr + cached_hole_size < first->va_start)
+ cached_hole_size = first->va_start - addr;
+ addr = ALIGN(first->va_end, align);
+ if (addr + size < addr)
+ goto overflow;
- area->flags = flags;
- area->addr = (void *)addr;
- area->size = size;
- area->pages = NULL;
- area->nr_pages = 0;
- area->phys_addr = 0;
- write_unlock(&vmlist_lock);
+ if (list_is_last(&first->list, &vmap_area_list))
+ goto found;
- return area;
+ first = list_entry(first->list.next,
+ struct vmap_area, list);
+ }
-out:
- write_unlock(&vmlist_lock);
- kfree(area);
+found:
+ if (addr + size > vend)
+ goto overflow;
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->flags = 0;
+ __insert_vmap_area(va);
+ free_vmap_cache = &va->rb_node;
+ spin_unlock(&vmap_area_lock);
+
+ BUG_ON(va->va_start & (align-1));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
+
+ return va;
+
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = 1;
+ goto retry;
+ }
if (printk_ratelimit())
- printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
- return NULL;
+ printk(KERN_WARNING
+ "vmap allocation for size %lu failed: "
+ "use vmalloc=<size> to increase size.\n", size);
+ kfree(va);
+ return ERR_PTR(-EBUSY);
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
+static void __free_vmap_area(struct vmap_area *va) /* unlink va from tree and list, then RCU-free it; callers in this file hold vmap_area_lock */
+{
+ BUG_ON(RB_EMPTY_NODE(&va->rb_node)); /* va must still be linked in the tree */
+
+ if (free_vmap_cache) {
+ if (va->va_end < cached_vstart) {
+ free_vmap_cache = NULL; /* freed area ends below the cached search start: drop the cache */
+ } else {
+ struct vmap_area *cache;
+ cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ if (va->va_start <= cache->va_start) {
+ free_vmap_cache = rb_prev(&va->rb_node); /* move the cache to the area just before va (may be NULL) */
+ /*
+ * We don't try to update cached_hole_size or
+ * cached_align, but it won't go very wrong.
+ */
+ }
+ }
+ }
+ rb_erase(&va->rb_node, &vmap_area_root);
+ RB_CLEAR_NODE(&va->rb_node); /* so the BUG_ON above trips if va is freed twice */
+ list_del_rcu(&va->list);
+
+ /*
+ * Track the highest possible candidate for pcpu area
+ * allocation. Areas outside of vmalloc area can be returned
+ * here too, consider only end addresses which fall inside
+ * vmalloc area proper.
+ */
+ if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
+ vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+
+ kfree_rcu(va, rcu_head); /* freed via RCU since va was on an RCU-traversed list */
+}
+
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ *
+ * Locked wrapper: takes vmap_area_lock around __free_vmap_area().
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ __free_vmap_area(va);
+ spin_unlock(&vmap_area_lock);
+}
+
+/*
+ * Clear the pagetable entries of a given vmap_area
+ *
+ * Covers [va->va_start, va->va_end) via vunmap_page_range(); no cache or
+ * TLB flush is issued from here.
+ */
+static void unmap_vmap_area(struct vmap_area *va)
+{
+ vunmap_page_range(va->va_start, va->va_end);
+}
+
+static void vmap_debug_free_range(unsigned long start, unsigned long end)
+{
+ /*
+ * Unmap page tables and force a TLB flush immediately if
+ * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
+ * bugs similarly to those in linear kernel virtual address
+ * space after a page has been freed.
+ *
+ * All the lazy freeing logic is still retained, in order to
+ * minimise intrusiveness of this debugging feature.
+ *
+ * This is going to be *slow* (linear kernel virtual address
+ * debugging doesn't do a broadcast TLB flush so it is a lot
+ * faster).
+ *
+ * Without CONFIG_DEBUG_PAGEALLOC the body is empty and the
+ * [start, end) arguments are unused.
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+#endif
+}
+
+/*
+ * lazy_max_pages - how many pages of lazily freed kernel virtual address
+ * space may pile up before a purge (and its TLB flush) is attempted.
+ *
+ * Raising the limit means fewer global TLB flushes at the price of more
+ * kernel page tables to cover per purge. Scaling linearly with the CPU count
+ * would look natural, but real workloads are unlikely to scale their vmap
+ * activity that way and a large limit adds latency on huge machines. The
+ * limit therefore grows with fls() of the number of online CPUs, i.e. on a
+ * log scale, in steps of 32MB worth of pages. The scale factor is easy to
+ * change should it become a problem on bigger systems.
+ */
+static unsigned long lazy_max_pages(void)
+{
+	unsigned int steps = fls(num_online_cpus());
+
+	return steps * (32UL * 1024 * 1024 / PAGE_SIZE);
+}
+
+static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
+
+/*
+ * Called before a call to iounmap() if the caller wants the unmapped vmap
+ * areas purged immediately instead of lazily.
+ *
+ * It works by pushing vmap_lazy_nr just past lazy_max_pages(), so the next
+ * free_vmap_area_noflush() sees the threshold exceeded and tries a purge.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
+ * Purges all lazily-freed vmap areas.
+ *
+ * If sync is 0 then don't purge if there is already a purge in progress.
+ * If force_flush is 1, then flush kernel TLBs between *start and *end even
+ * if we found no lazy vmap areas to unmap (callers can use this to optimise
+ * their own TLB flushing).
+ * Returns with *start = min(*start, lowest purged address)
+ * *end = max(*end, highest purged address)
+ *
+ * The whole operation is serialised by the function-local purge_lock; the
+ * collected areas are only released (under vmap_area_lock) after the TLB
+ * flush for the accumulated range has been issued.
+ */
+static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+ int sync, int force_flush)
+{
+ static DEFINE_SPINLOCK(purge_lock);
+ LIST_HEAD(valist); /* areas collected for release in this pass */
+ struct vmap_area *va;
+ struct vmap_area *n_va;
+ int nr = 0; /* number of pages covered by the collected areas */
+
+ /*
+ * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
+ * should not expect such behaviour. This just simplifies locking for
+ * the case that isn't actually used at the moment anyway.
+ */
+ if (!sync && !force_flush) {
+ if (!spin_trylock(&purge_lock))
+ return; /* a purge is already running; leave it to that one */
+ } else
+ spin_lock(&purge_lock);
+
+ if (sync)
+ purge_fragmented_blocks_allcpus();
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(va, &vmap_area_list, list) {
+ if (va->flags & VM_LAZY_FREE) {
+ if (va->va_start < *start)
+ *start = va->va_start;
+ if (va->va_end > *end)
+ *end = va->va_end;
+ nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+ list_add_tail(&va->purge_list, &valist);
+ va->flags |= VM_LAZY_FREEING;
+ va->flags &= ~VM_LAZY_FREE; /* no longer matches the VM_LAZY_FREE test above */
+ }
+ }
+ rcu_read_unlock();
+
+ if (nr)
+ atomic_sub(nr, &vmap_lazy_nr); /* drop the pages accounted by free_vmap_area_noflush() */
+
+ if (nr || force_flush)
+ flush_tlb_kernel_range(*start, *end);
+
+ if (nr) {
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry_safe(va, n_va, &valist, purge_list)
+ __free_vmap_area(va);
+ spin_unlock(&vmap_area_lock);
+ }
+ spin_unlock(&purge_lock);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ *
+ * Calls __purge_vmap_area_lazy() with sync = 0 and force_flush = 0; start/end
+ * begin as an empty range (ULONG_MAX, 0) so only purged areas widen it.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+
+ __purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas.
+ *
+ * Calls __purge_vmap_area_lazy() with sync = 1 and force_flush = 0, so it
+ * waits for purge_lock rather than giving up when a purge is in progress.
+ */
+static void purge_vmap_area_lazy(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+
+ __purge_vmap_area_lazy(&start, &end, 1, 0);
+}
+
+/*
+ * Free a vmap area, caller ensuring that the area has been unmapped
+ * and flush_cache_vunmap had been called for the correct range
+ * previously.
+ *
+ * The area is only marked VM_LAZY_FREE and its page count added to
+ * vmap_lazy_nr here; it is actually released by a later purge, which is
+ * attempted right away once the count exceeds lazy_max_pages().
+ */
+static void free_vmap_area_noflush(struct vmap_area *va)
+{
+ va->flags |= VM_LAZY_FREE;
+ atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
+ if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
+}
+
+/*
+ * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
+ * called for the correct range previously.
+ *
+ * Clears the page tables first, then queues the area for lazy freeing.
+ */
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
+{
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
+}
+
+/*
+ * Free and unmap a vmap area
+ *
+ * Same as free_unmap_vmap_area_noflush(), but calls flush_cache_vunmap()
+ * for the area's range first.
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+ flush_cache_vunmap(va->va_start, va->va_end);
+ free_unmap_vmap_area_noflush(va);
+}
+
+/*
+ * Locked lookup: run __find_vmap_area() for @addr while holding
+ * vmap_area_lock. Returns the covering vmap_area, or NULL if there is none.
+ */
+static struct vmap_area *find_vmap_area(unsigned long addr)
+{
+	struct vmap_area *found;
+
+	spin_lock(&vmap_area_lock);
+	found = __find_vmap_area(addr);
+	spin_unlock(&vmap_area_lock);
+	return found;
+}
+
+/*
+ * Find the vmap area covering @addr, then unmap and free it. It is a BUG
+ * for @addr not to belong to any vmap area.
+ */
+static void free_unmap_vmap_area_addr(unsigned long addr)
+{
+	struct vmap_area *area = find_vmap_area(addr);
+
+	BUG_ON(!area);
+	free_unmap_vmap_area(area);
+}
+
+
+/*** Per cpu kva allocator ***/
+
+/*
+ * vmap space is limited especially on 32 bit architectures. Ensure there is
+ * room for at least 16 percpu vmap blocks per CPU.
+ */
+/*
+ * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
+ * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
+ * instead (we just need a rough idea)
+ */
+#if BITS_PER_LONG == 32
+#define VMALLOC_SPACE (128UL*1024*1024)
+#else
+#define VMALLOC_SPACE (128UL*1024*1024*1024)
+#endif
+
+#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
+#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
+#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
+#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
+#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
+#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
+#define VMAP_BBMAP_BITS \
+ VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
+ VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
+ VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
+
+#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
+
+static bool vmap_initialized __read_mostly = false;
+
+struct vmap_block_queue {
+ spinlock_t lock; /* taken around additions to and removals from ->free */
+ struct list_head free; /* vmap_blocks linked through their free_list member */
+};
+
+struct vmap_block {
+ spinlock_t lock; /* held while free/dirty/dirty_map are examined or updated */
+ struct vmap_area *va; /* the VMAP_BLOCK_SIZE area backing this block */
+ unsigned long free, dirty; /* pages not yet handed out / pages given back by vb_free() */
+ DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); /* one bit per page, set once the page was freed */
+ struct list_head free_list; /* link in a vmap_block_queue's ->free list */
+ struct rcu_head rcu_head; /* used by kfree_rcu() in free_vmap_block() */
+ struct list_head purge; /* link in the local list built by purge_fragmented_blocks() */
+};
+
+/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
+static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
+
+/*
+ * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * in the free path. Could get rid of this if we change the API to return a
+ * "cookie" from alloc, to be passed to free. But no big deal yet.
+ */
+static DEFINE_SPINLOCK(vmap_block_tree_lock);
+static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+
+/*
+ * We should probably have a fallback mechanism to allocate virtual memory
+ * out of partially filled vmap blocks. However vmap block sizing should be
+ * fairly reasonable according to the vmalloc size, so it shouldn't be a
+ * big problem.
+ */
+
+/*
+ * Convert an address into the index of the VMAP_BLOCK_SIZE-sized block that
+ * contains it, counted from VMALLOC_START rounded down to a block boundary.
+ * The result is used as the key into vmap_block_tree.
+ */
+static unsigned long addr_to_vb_idx(unsigned long addr)
+{
+	const unsigned long first_block = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
+
+	return (addr - first_block) / VMAP_BLOCK_SIZE;
+}
+
+static struct vmap_block *new_vmap_block(gfp_t gfp_mask) /* returns the new block or an ERR_PTR() */
+{
+ struct vmap_block_queue *vbq;
+ struct vmap_block *vb;
+ struct vmap_area *va;
+ unsigned long vb_idx;
+ int node, err;
+
+ node = numa_node_id();
+
+ vb = kmalloc_node(sizeof(struct vmap_block),
+ gfp_mask & GFP_RECLAIM_MASK, node);
+ if (unlikely(!vb))
+ return ERR_PTR(-ENOMEM);
+
+ va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
+ VMALLOC_START, VMALLOC_END,
+ node, gfp_mask); /* block-sized and block-aligned, so one radix index per block */
+ if (IS_ERR(va)) {
+ kfree(vb);
+ return ERR_CAST(va);
+ }
+
+ err = radix_tree_preload(gfp_mask);
+ if (unlikely(err)) {
+ kfree(vb);
+ free_vmap_area(va); /* never mapped, so it is released directly rather than lazily */
+ return ERR_PTR(err);
+ }
+
+ spin_lock_init(&vb->lock);
+ vb->va = va;
+ vb->free = VMAP_BBMAP_BITS; /* every page of the block is still available */
+ vb->dirty = 0;
+ bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ INIT_LIST_HEAD(&vb->free_list);
+
+ vb_idx = addr_to_vb_idx(va->va_start);
+ spin_lock(&vmap_block_tree_lock);
+ err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
+ spin_unlock(&vmap_block_tree_lock);
+ BUG_ON(err);
+ radix_tree_preload_end();
+
+ vbq = &get_cpu_var(vmap_block_queue); /* queue of the CPU we are running on */
+ spin_lock(&vbq->lock);
+ list_add_rcu(&vb->free_list, &vbq->free);
+ spin_unlock(&vbq->lock);
+ put_cpu_var(vmap_block_queue);
+
+ return vb;
+}
+
+static void free_vmap_block(struct vmap_block *vb) /* drop vb from the radix tree and release it and its area */
+{
+ struct vmap_block *tmp;
+ unsigned long vb_idx;
+
+ vb_idx = addr_to_vb_idx(vb->va->va_start);
+ spin_lock(&vmap_block_tree_lock);
+ tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
+ spin_unlock(&vmap_block_tree_lock);
+ BUG_ON(tmp != vb); /* the tree slot must have held exactly this block */
+
+ free_vmap_area_noflush(vb->va); /* the area goes through the lazy-free path */
+ kfree_rcu(vb, rcu_head); /* vb is looked up and walked under rcu_read_lock() elsewhere in this file */
+}
+
+static void purge_fragmented_blocks(int cpu) /* free blocks on cpu's queue whose pages are all either unused or already freed */
+{
+ LIST_HEAD(purge);
+ struct vmap_block *vb;
+ struct vmap_block *n_vb;
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+
+ if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) /* unlocked pre-check; repeated under vb->lock below */
+ continue;
+
+ spin_lock(&vb->lock);
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+ vb->free = 0; /* prevent further allocs after releasing lock */
+ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ spin_unlock(&vb->lock);
+ list_add_tail(&vb->purge, &purge); /* actual freeing is deferred until after the RCU walk */
+ } else
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
+static void purge_fragmented_blocks_allcpus(void)
{
- return __get_vm_area_node(size, flags, start, end, -1);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ purge_fragmented_blocks(cpu);
+}
+
+static void *vb_alloc(unsigned long size, gfp_t gfp_mask) /* returns an address, NULL for size 0, or an ERR_PTR() */
+{
+ struct vmap_block_queue *vbq;
+ struct vmap_block *vb;
+ unsigned long addr = 0; /* 0 doubles as "nothing found yet" */
+ unsigned int order;
+
+ BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+ if (WARN_ON(size == 0)) {
+ /*
+ * Allocating 0 bytes isn't what caller wants since
+ * get_order(0) returns funny result. Just warn and terminate
+ * early.
+ */
+ return NULL;
+ }
+ order = get_order(size); /* the block is charged 1 << order pages below */
+
+again:
+ rcu_read_lock();
+ vbq = &get_cpu_var(vmap_block_queue);
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ int i;
+
+ spin_lock(&vb->lock);
+ if (vb->free < 1UL << order)
+ goto next; /* not enough untouched pages left in this block */
+
+ i = VMAP_BBMAP_BITS - vb->free; /* index of the first page not handed out yet */
+ addr = vb->va->va_start + (i << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) !=
+ addr_to_vb_idx(vb->va->va_start));
+ vb->free -= 1UL << order;
+ if (vb->free == 0) { /* block exhausted: take it off the queue */
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ }
+ spin_unlock(&vb->lock);
+ break;
+next:
+ spin_unlock(&vb->lock);
+ }
+
+ put_cpu_var(vmap_block_queue);
+ rcu_read_unlock();
+
+ if (!addr) { /* no queued block had room: create one and rescan */
+ vb = new_vmap_block(gfp_mask);
+ if (IS_ERR(vb))
+ return vb; /* hand the ERR_PTR() from new_vmap_block() to the caller */
+ goto again;
+ }
+
+ return (void *)addr;
+}
+
+static void vb_free(const void *addr, unsigned long size) /* give back a range obtained from vb_alloc() */
+{
+ unsigned long offset;
+ unsigned long vb_idx;
+ unsigned int order;
+ struct vmap_block *vb;
+
+ BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+
+ flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+
+ order = get_order(size);
+
+ offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); /* byte offset of addr inside its block */
+
+ vb_idx = addr_to_vb_idx((unsigned long)addr);
+ rcu_read_lock();
+ vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
+ rcu_read_unlock();
+ BUG_ON(!vb);
+
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
+ spin_lock(&vb->lock);
+ BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); /* mark the pages dirty; BUG if that fails */
+
+ vb->dirty += 1UL << order;
+ if (vb->dirty == VMAP_BBMAP_BITS) { /* every page of the block has been freed */
+ BUG_ON(vb->free);
+ spin_unlock(&vb->lock);
+ free_vmap_block(vb);
+ } else
+ spin_unlock(&vb->lock);
 }
/**
- * get_vm_area - reserve a contingous kernel virtual area
- * @size: size of the area
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
*
- * Search an area of @size in the kernel virtual mapping area,
- * and reserved it for out purposes. Returns the area descriptor
- * on success or %NULL on failure.
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
*/
-struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+void vm_unmap_aliases(void)
{
- return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END);
+ unsigned long start = ULONG_MAX, end = 0;
+ int cpu;
+ int flush = 0;
+
+ if (unlikely(!vmap_initialized))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+ struct vmap_block *vb;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+ int i, j;
+
+ spin_lock(&vb->lock);
+ i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
+ if (i < VMAP_BBMAP_BITS) {
+ unsigned long s, e;
+
+ j = find_last_bit(vb->dirty_map,
+ VMAP_BBMAP_BITS);
+ j = j + 1; /* need exclusive index */
+
+ s = vb->va->va_start + (i << PAGE_SHIFT);
+ e = vb->va->va_start + (j << PAGE_SHIFT);
+ flush = 1;
+
+ if (s < start)
+ start = s;
+ if (e > end)
+ end = e;
+ }
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+ }
+
+ __purge_vmap_area_lazy(&start, &end, 1, flush);
}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node)
+/**
+ * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
+ * @mem: the pointer returned by vm_map_ram
+ * @count: the count passed to that vm_map_ram call (cannot unmap partial)
+ *
+ * Mappings of at most VMAP_MAX_ALLOC pages are released through vb_free();
+ * larger ones are looked up by address and released as a whole vmap area.
+ */
+void vm_unmap_ram(const void *mem, unsigned int count)
 {
- return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+ /*
+ * Widen before shifting: count is an unsigned int, so shifting it
+ * directly is done in 32 bits and wraps the byte size for large counts
+ * where unsigned long is 64 bits wide.
+ */
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
+ unsigned long addr = (unsigned long)mem;
+
+ BUG_ON(!addr);
+ BUG_ON(addr < VMALLOC_START);
+ BUG_ON(addr > VMALLOC_END);
+ BUG_ON(addr & (PAGE_SIZE-1));
+
+ debug_check_no_locks_freed(mem, size);
+ vmap_debug_free_range(addr, addr+size);
+
+ if (likely(count <= VMAP_MAX_ALLOC))
+ vb_free(mem, size);
+ else
+ free_unmap_vmap_area_addr(addr);
 }
+EXPORT_SYMBOL(vm_unmap_ram);
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__find_vm_area(void *addr)
+/**
+ * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
+ * @pages: an array of pointers to the pages to be mapped
+ * @count: number of pages
+ * @node: prefer to allocate data structures on this node
+ * @prot: memory protection to use. PAGE_KERNEL for regular RAM
+ *
+ * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
+ * faster than vmap so it's good. But if you mix long-life and short-life
+ * objects with vm_map_ram(), it could consume lots of address space through
+ * fragmentation (especially on a 32bit machine). You could see failures in
+ * the end. Please use this function for short-lived objects.
+ *
+ * Requests of at most VMAP_MAX_ALLOC pages are served by vb_alloc(); larger
+ * ones get their own vmap area. If setting up the page tables fails, the
+ * address range is released again with vm_unmap_ram().
+ *
+ * Returns: a pointer to the address that has been mapped, or %NULL on failure
+ */
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 {
- struct vm_struct *tmp;
+ /*
+ * Widen before shifting: count is an unsigned int, so shifting it
+ * directly is done in 32 bits and wraps the byte size for large counts
+ * where unsigned long is 64 bits wide.
+ */
+ unsigned long size = (unsigned long)count << PAGE_SHIFT;
+ unsigned long addr;
+ void *mem;
+
+ if (likely(count <= VMAP_MAX_ALLOC)) {
+ mem = vb_alloc(size, GFP_KERNEL);
+ if (IS_ERR(mem))
+ return NULL;
+ addr = (unsigned long)mem;
+ } else {
+ struct vmap_area *va;
+ va = alloc_vmap_area(size, PAGE_SIZE,
+ VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ if (IS_ERR(va))
+ return NULL;
+
+ addr = va->va_start;
+ mem = (void *)addr;
+ }
+ if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ vm_unmap_ram(mem, count);
+ return NULL;
+ }
+ return mem;
+}
+EXPORT_SYMBOL(vm_map_ram);
- for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
- if (tmp->addr == addr)
+static struct vm_struct *vmlist __initdata;
+/**
+ * vm_area_add_early - add vmap area early during boot
+ * @vm: vm_struct to add
+ *
+ * This function is used to add fixed kernel vm area to vmlist before
+ * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
+ * should contain proper values and the other fields should be zero.
+ *
+ * The list is kept sorted by address: @vm is linked in front of the first
+ * entry that starts at or above @vm->addr. Overlap with a neighbouring
+ * entry, or a call after vmalloc_init(), triggers BUG().
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_add_early(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+
+ BUG_ON(vmap_initialized);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr) {
+ BUG_ON(tmp->addr < vm->addr + vm->size); /* vm must end before the next entry starts */
 break;
+ } else
+ BUG_ON(tmp->addr + tmp->size > vm->addr); /* a lower entry must end before vm starts */
 }
+ vm->next = *p;
+ *p = vm;
+}
+
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called. @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero. On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+ static size_t vm_init_off __initdata;
+ unsigned long addr;
- return tmp;
+ addr = ALIGN(VMALLOC_START + vm_init_off, align);
+ vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+
+ vm->addr = (void *)addr;
+
+ vm_area_add_early(vm);
}
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__remove_vm_area(void *addr)
+void __init vmalloc_init(void)
{
- struct vm_struct **p, *tmp;
+ struct vmap_area *va;
+ struct vm_struct *tmp;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct vmap_block_queue *vbq;
+ struct vfree_deferred *p;
+
+ vbq = &per_cpu(vmap_block_queue, i);
+ spin_lock_init(&vbq->lock);
+ INIT_LIST_HEAD(&vbq->free);
+ p = &per_cpu(vfree_deferred, i);
+ init_llist_head(&p->list);
+ INIT_WORK(&p->wq, free_work);
+ }
- for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
- if (tmp->addr == addr)
- goto found;
+ /* Import existing vmlist entries. */
+ for (tmp = vmlist; tmp; tmp = tmp->next) {
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ va->flags = VM_VM_AREA;
+ va->va_start = (unsigned long)tmp->addr;
+ va->va_end = va->va_start + tmp->size;
+ va->vm = tmp;
+ __insert_vmap_area(va);
}
- return NULL;
-found:
- unmap_vm_area(tmp);
- *p = tmp->next;
+ vmap_area_pcpu_hole = VMALLOC_END;
+
+ vmap_initialized = true;
+}
+
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area that @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vmap() on to-be-mapped areas
+ * before calling this function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
+{
+ return vmap_page_range_noflush(addr, addr + size, prot, pages);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area that @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+ vunmap_page_range(addr, addr + size);
+}
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
+void unmap_kernel_range(unsigned long addr, unsigned long size)
+{
+ unsigned long end = addr + size;
+
+ flush_cache_vunmap(addr, end);
+ vunmap_page_range(addr, end);
+ flush_tlb_kernel_range(addr, end);
+}
+EXPORT_SYMBOL_GPL(unmap_kernel_range);
+
+int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long end = addr + get_vm_area_size(area);
+ int err;
+
+ err = vmap_page_range(addr, end, prot, *pages);
+ if (err > 0) {
+ *pages += err;
+ err = 0;
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(map_vm_area);
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+ va->flags |= VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
+}
+
+static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+{
/*
- * Remove the guard page.
+ * Before removing VM_UNINITIALIZED,
+ * we should make sure that vm has proper values.
+ * Pair with smp_rmb() in show_numa_info().
*/
- tmp->size -= PAGE_SIZE;
- return tmp;
+ smp_wmb();
+ vm->flags &= ~VM_UNINITIALIZED;
+}
+
+static struct vm_struct *__get_vm_area_node(unsigned long size,
+ unsigned long align, unsigned long flags, unsigned long start,
+ unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+{
+ struct vmap_area *va;
+ struct vm_struct *area;
+
+ BUG_ON(in_interrupt());
+ if (flags & VM_IOREMAP)
+ align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
+ size = PAGE_ALIGN(size);
+ if (unlikely(!size))
+ return NULL;
+
+ area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+ if (unlikely(!area))
+ return NULL;
+
+ /*
+ * We always allocate a guard page.
+ */
+ size += PAGE_SIZE;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ if (IS_ERR(va)) {
+ kfree(area);
+ return NULL;
+ }
+
+ setup_vmalloc_vm(area, va, flags, caller);
+
+ return area;
+}
+
+struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end)
+{
+ return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
+ GFP_KERNEL, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(__get_vm_area);
+
+struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end,
+ const void *caller)
+{
+ return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
+ GFP_KERNEL, caller);
}
/**
- * remove_vm_area - find and remove a contingous kernel virtual area
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ * Search for an area of @size in the kernel virtual mapping area,
+ * and reserve it for our purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL,
+ __builtin_return_address(0));
+}
+
+struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
+ const void *caller)
+{
+ return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
+}
+
+/**
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ */
+struct vm_struct *find_vm_area(const void *addr)
+{
+ struct vmap_area *va;
+
+ va = find_vmap_area((unsigned long)addr);
+ if (va && va->flags & VM_VM_AREA)
+ return va->vm;
+
+ return NULL;
+}
+
+/**
+ * remove_vm_area - find and remove a continuous kernel virtual area
* @addr: base address
*
* Search for the kernel VM area starting at @addr, and remove it.
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*/
-struct vm_struct *remove_vm_area(void *addr)
+struct vm_struct *remove_vm_area(const void *addr)
{
- struct vm_struct *v;
- write_lock(&vmlist_lock);
- v = __remove_vm_area(addr);
- write_unlock(&vmlist_lock);
- return v;
+ struct vmap_area *va;
+
+ va = find_vmap_area((unsigned long)addr);
+ if (va && va->flags & VM_VM_AREA) {
+ struct vm_struct *vm = va->vm;
+
+ spin_lock(&vmap_area_lock);
+ va->vm = NULL;
+ va->flags &= ~VM_VM_AREA;
+ spin_unlock(&vmap_area_lock);
+
+ vmap_debug_free_range(va->va_start, va->va_end);
+ free_unmap_vmap_area(va);
+ vm->size -= PAGE_SIZE;
+
+ return vm;
+ }
+ return NULL;
}
-void __vunmap(void *addr, int deallocate_pages)
+static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
if (!addr)
return;
- if ((PAGE_SIZE-1) & (unsigned long)addr) {
- printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
- WARN_ON(1);
+ if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+ addr))
return;
- }
area = remove_vm_area(addr);
if (unlikely(!area)) {
- printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+ WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
- WARN_ON(1);
return;
}
debug_check_no_locks_freed(addr, area->size);
+ debug_check_no_obj_freed(addr, area->size);
if (deallocate_pages) {
int i;
for (i = 0; i < area->nr_pages; i++) {
- BUG_ON(!area->pages[i]);
- __free_page(area->pages[i]);
+ struct page *page = area->pages[i];
+
+ BUG_ON(!page);
+ __free_page(page);
}
if (area->flags & VM_VPAGES)
@@ -350,21 +1473,35 @@ void __vunmap(void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
*
- * Free the virtually contiguous memory area starting at @addr, as
+ * Free the virtually continuous memory area starting at @addr, as
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
* NULL, no operation is performed.
*
- * Must not be called in interrupt context.
+ * Must not be called in NMI context (strictly speaking, only if we don't
+ * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea)
+ *
+ * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
*/
-void vfree(void *addr)
+void vfree(const void *addr)
{
- BUG_ON(in_interrupt());
- __vunmap(addr, 1);
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ if (unlikely(in_interrupt())) {
+ struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+ } else
+ __vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
@@ -377,10 +1514,12 @@ EXPORT_SYMBOL(vfree);
*
* Must not be called in interrupt context.
*/
-void vunmap(void *addr)
+void vunmap(const void *addr)
{
BUG_ON(in_interrupt());
- __vunmap(addr, 0);
+ might_sleep();
+ if (addr)
+ __vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
@@ -399,12 +1538,16 @@ void *vmap(struct page **pages, unsigned int count,
{
struct vm_struct *area;
- if (count > num_physpages)
+ might_sleep();
+
+ if (count > totalram_pages)
return NULL;
- area = get_vm_area((count << PAGE_SHIFT), flags);
+ area = get_vm_area_caller((count << PAGE_SHIFT), flags,
+ __builtin_return_address(0));
if (!area)
return NULL;
+
if (map_vm_area(area, prot, &pages)) {
vunmap(area->addr);
return NULL;
@@ -414,40 +1557,51 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, const void *caller);
+static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot, int node)
{
+ const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
+ gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+ nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node);
+ pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+ PAGE_KERNEL, node, area->caller);
area->flags |= VM_VPAGES;
- } else
- pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node);
+ } else {
+ pages = kmalloc_node(array_size, nested_gfp, node);
+ }
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
- memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) {
- if (node < 0)
- area->pages[i] = alloc_page(gfp_mask);
+ struct page *page;
+ gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
+
+ if (node == NUMA_NO_NODE)
+ page = alloc_page(tmp_mask);
else
- area->pages[i] = alloc_pages_node(node, gfp_mask, 0);
- if (unlikely(!area->pages[i])) {
+ page = alloc_pages_node(node, tmp_mask, order);
+
+ if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
+ area->pages[i] = page;
}
if (map_vm_area(area, prot, &pages))
@@ -455,64 +1609,141 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
+ warn_alloc_failed(gfp_mask, order,
+ "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
+ (area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
- return __vmalloc_area_node(area, gfp_mask, prot, -1);
-}
-
/**
- * __vmalloc_node - allocate virtually contiguous memory
+ * __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
- * @node: node to use for allocation or -1
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
- int node)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, const void *caller)
{
struct vm_struct *area;
+ void *addr;
+ unsigned long real_size = size;
size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > num_physpages)
- return NULL;
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages)
+ goto fail;
- area = get_vm_area_node(size, VM_ALLOC, node);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
+ start, end, node, gfp_mask, caller);
if (!area)
+ goto fail;
+
+ addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ if (!addr)
return NULL;
- return __vmalloc_area_node(area, gfp_mask, prot, node);
+ /*
+ * In this function, newly allocated vm_struct has VM_UNINITIALIZED
+ * flag. It means that vm_struct is not fully initialized.
+ * Now, it is fully initialized, so remove this flag here.
+ */
+ clear_vm_uninitialized_flag(area);
+
+ /*
+ * A ref_count = 2 is needed because vm_struct allocated in
+ * __get_vm_area_node() contains a reference to the virtual address of
+ * the vmalloc'ed block.
+ */
+ kmemleak_alloc(addr, real_size, 2, gfp_mask);
+
+ return addr;
+
+fail:
+ warn_alloc_failed(gfp_mask, 0,
+ "vmalloc: allocation failure: %lu bytes\n",
+ real_size);
+ return NULL;
+}
+
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, const void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, prot, node, caller);
}
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
- return __vmalloc_node(size, gfp_mask, prot, -1);
+ return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
+static inline void *__vmalloc_node_flags(unsigned long size,
+ int node, gfp_t flags)
+{
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc_node_flags(size, NUMA_NO_NODE,
+ GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc_node_flags(size, NUMA_NO_NODE,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
@@ -524,12 +1755,14 @@ void *vmalloc_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
- write_lock(&vmlist_lock);
- area = __find_vm_area(ret);
- area->flags |= VM_USERMAP;
- write_unlock(&vmlist_lock);
-
+ ret = __vmalloc_node(size, SHMLBA,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (ret) {
+ area = find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ }
return ret;
}
EXPORT_SYMBOL(vmalloc_user);
@@ -542,15 +1775,35 @@ EXPORT_SYMBOL(vmalloc_user);
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc_node() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node_flags(size, node,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -563,15 +1816,24 @@ EXPORT_SYMBOL(vmalloc_node);
* the page level allocator and map them into contiguous and
* executable kernel virtual space.
*
- * For tight cotrol over page level allocator and protection flags
+ * For tight control over page level allocator and protection flags
* use __vmalloc() instead.
*/
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ NUMA_NO_NODE, __builtin_return_address(0));
}
+#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
+#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
+#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
+#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
+#else
+#define GFP_VMALLOC32 GFP_KERNEL
+#endif
+
/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
@@ -581,7 +1843,8 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
+ NUMA_NO_NODE, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -597,30 +1860,146 @@ void *vmalloc_32_user(unsigned long size)
struct vm_struct *area;
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
- write_lock(&vmlist_lock);
- area = __find_vm_area(ret);
- area->flags |= VM_USERMAP;
- write_unlock(&vmlist_lock);
-
+ ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (ret) {
+ area = find_vm_area(ret);
+ area->flags |= VM_USERMAP;
+ }
return ret;
}
EXPORT_SYMBOL(vmalloc_32_user);
+/*
+ * Small helper routine: copy contents to buf from addr.
+ * If the page is not present, fill zero.
+ */
+
+static int aligned_vread(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = (unsigned long)addr & ~PAGE_MASK;
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calls for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p);
+ memcpy(buf, map + offset, length);
+ kunmap_atomic(map);
+ } else
+ memset(buf, 0, length);
+
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+static int aligned_vwrite(char *buf, char *addr, unsigned long count)
+{
+ struct page *p;
+ int copied = 0;
+
+ while (count) {
+ unsigned long offset, length;
+
+ offset = (unsigned long)addr & ~PAGE_MASK;
+ length = PAGE_SIZE - offset;
+ if (length > count)
+ length = count;
+ p = vmalloc_to_page(addr);
+ /*
+ * To do safe access to this _mapped_ area, we need
+ * lock. But adding lock here means that we need to add
+ * overhead of vmalloc()/vfree() calls for this _debug_
+ * interface, rarely used. Instead of that, we'll use
+ * kmap() and get small overhead in this access function.
+ */
+ if (p) {
+ /*
+ * we can expect USER0 is not used (see vread/vwrite's
+ * function description)
+ */
+ void *map = kmap_atomic(p);
+ memcpy(map + offset, buf, length);
+ kunmap_atomic(map);
+ }
+ addr += length;
+ buf += length;
+ copied += length;
+ count -= length;
+ }
+ return copied;
+}
+
+/**
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
+ *
+ * Returns the number of bytes by which addr and buf should be increased
+ * (same number as @count). Returns 0 if [addr...addr+count) doesn't
+ * intersect with any live vmalloc area.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't intersect with any live
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, such as /dev/kmem.
+ *
+ */
+
long vread(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
+ struct vmap_area *va;
+ struct vm_struct *vm;
char *vaddr, *buf_start = buf;
+ unsigned long buflen = count;
unsigned long n;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
- read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + get_vm_area_size(vm))
continue;
while (addr < vaddr) {
if (count == 0)
@@ -630,35 +2009,79 @@ long vread(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *buf = *addr;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
+ n = vaddr + get_vm_area_size(vm) - addr;
+ if (n > count)
+ n = count;
+ if (!(vm->flags & VM_IOREMAP))
+ aligned_vread(buf, addr, n);
+ else /* IOREMAP area is treated as memory hole */
+ memset(buf, 0, n);
+ buf += n;
+ addr += n;
+ count -= n;
}
finished:
- read_unlock(&vmlist_lock);
- return buf - buf_start;
+ spin_unlock(&vmap_area_lock);
+
+ if (buf == buf_start)
+ return 0;
+ /* zero-fill memory holes */
+ if (buf != buf_start + buflen)
+ memset(buf, 0, buflen - (buf - buf_start));
+
+ return buflen;
}
+/**
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be written.
+ *
+ * Returns the number of bytes by which addr and buf should be increased
+ * (same number as @count).
+ * If [addr...addr+count) doesn't intersect with any valid
+ * vmalloc area, returns 0.
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from a buffer to the given addr. If specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * proper area of @buf. If there are memory holes, no copy to hole.
+ * IOREMAP area is treated as memory hole and no copy is done.
+ *
+ * If [addr...addr+count) doesn't intersect with any live
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
+ *
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, such as /dev/kmem.
+ */
+
long vwrite(char *buf, char *addr, unsigned long count)
{
- struct vm_struct *tmp;
- char *vaddr, *buf_start = buf;
- unsigned long n;
+ struct vmap_area *va;
+ struct vm_struct *vm;
+ char *vaddr;
+ unsigned long n, buflen;
+ int copied = 0;
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
+ buflen = count;
- read_lock(&vmlist_lock);
- for (tmp = vmlist; tmp; tmp = tmp->next) {
- vaddr = (char *) tmp->addr;
- if (addr >= vaddr + tmp->size - PAGE_SIZE)
+ spin_lock(&vmap_area_lock);
+ list_for_each_entry(va, &vmap_area_list, list) {
+ if (!count)
+ break;
+
+ if (!(va->flags & VM_VM_AREA))
+ continue;
+
+ vm = va->vm;
+ vaddr = (char *) vm->addr;
+ if (addr >= vaddr + get_vm_area_size(vm))
continue;
while (addr < vaddr) {
if (count == 0)
@@ -667,77 +2090,641 @@ long vwrite(char *buf, char *addr, unsigned long count)
addr++;
count--;
}
- n = vaddr + tmp->size - PAGE_SIZE - addr;
- do {
- if (count == 0)
- goto finished;
- *addr = *buf;
- buf++;
- addr++;
- count--;
- } while (--n > 0);
+ n = vaddr + get_vm_area_size(vm) - addr;
+ if (n > count)
+ n = count;
+ if (!(vm->flags & VM_IOREMAP)) {
+ aligned_vwrite(buf, addr, n);
+ copied++;
+ }
+ buf += n;
+ addr += n;
+ count -= n;
}
finished:
- read_unlock(&vmlist_lock);
- return buf - buf_start;
+ spin_unlock(&vmap_area_lock);
+ if (!copied)
+ return 0;
+ return buflen;
}
/**
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @size: size of map area
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
+ void *kaddr, unsigned long size)
+{
+ struct vm_struct *area;
+
+ size = PAGE_ALIGN(size);
+
+ if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
+ return -EINVAL;
+
+ area = find_vm_area(kaddr);
+ if (!area)
+ return -EINVAL;
+
+ if (!(area->flags & VM_USERMAP))
+ return -EINVAL;
+
+ if (kaddr + size > area->addr + area->size)
+ return -EINVAL;
+
+ do {
+ struct page *page = vmalloc_to_page(kaddr);
+ int ret;
+
+ ret = vm_insert_page(vma, uaddr, page);
+ if (ret)
+ return ret;
+
+ uaddr += PAGE_SIZE;
+ kaddr += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ } while (size > 0);
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+
+ return 0;
+}
+EXPORT_SYMBOL(remap_vmalloc_range_partial);
+
+/**
* remap_vmalloc_range - map vmalloc pages to userspace
* @vma: vma to cover (map full range of vma)
* @addr: vmalloc memory
* @pgoff: number of pages into addr before first page to map
- * @returns: 0 for success, -Exxx on failure
+ *
+ * Returns: 0 for success, -Exxx on failure
*
* This function checks that addr is a valid vmalloc'ed area, and
* that it is big enough to cover the vma. Will return failure if
* that criteria isn't met.
*
- * Similar to remap_pfn_range (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
+ return remap_vmalloc_range_partial(vma, vma->vm_start,
+ addr + (pgoff << PAGE_SHIFT),
+ vma->vm_end - vma->vm_start);
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+
+/*
+ * Implement a stub for vmalloc_sync_all() if the architecture chose not to
+ * have one.
+ */
+void __weak vmalloc_sync_all(void)
+{
+}
+
+
+static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
+{
+ pte_t ***p = data;
+
+ if (p) {
+ *(*p) = pte;
+ (*p)++;
+ }
+ return 0;
+}
+
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @ptes: returns the PTEs for the address space
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created.
+ *
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
+ */
+struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
+{
struct vm_struct *area;
- unsigned long uaddr = vma->vm_start;
- unsigned long usize = vma->vm_end - vma->vm_start;
- int ret;
- if ((PAGE_SIZE-1) & (unsigned long)addr)
- return -EINVAL;
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (area == NULL)
+ return NULL;
- read_lock(&vmlist_lock);
- area = __find_vm_area(addr);
- if (!area)
- goto out_einval_locked;
+ /*
+ * This ensures that page tables are constructed for this region
+ * of kernel virtual address space and mapped into init_mm.
+ */
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ size, f, ptes ? &ptes : NULL)) {
+ free_vm_area(area);
+ return NULL;
+ }
- if (!(area->flags & VM_USERMAP))
- goto out_einval_locked;
+ return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
- if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
- goto out_einval_locked;
- read_unlock(&vmlist_lock);
+void free_vm_area(struct vm_struct *area)
+{
+ struct vm_struct *ret;
+ ret = remove_vm_area(area->addr);
+ BUG_ON(ret != area);
+ kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
- addr += pgoff << PAGE_SHIFT;
- do {
- struct page *page = vmalloc_to_page(addr);
- ret = vm_insert_page(vma, uaddr, page);
- if (ret)
- return ret;
+#ifdef CONFIG_SMP
+static struct vmap_area *node_to_va(struct rb_node *n)
+{
+ return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
+}
- uaddr += PAGE_SIZE;
- addr += PAGE_SIZE;
- usize -= PAGE_SIZE;
- } while (usize > 0);
+/**
+ * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
+ * @end: target address
+ * @pnext: out arg for the next vmap_area
+ * @pprev: out arg for the previous vmap_area
+ *
+ * Returns: %true if either or both of next and prev are found,
+ * %false if no vmap_area exists
+ *
+ * Find the vmap_areas whose end addresses enclose @end, i.e. if not
+ * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ */
+static bool pvm_find_next_prev(unsigned long end,
+ struct vmap_area **pnext,
+ struct vmap_area **pprev)
+{
+ struct rb_node *n = vmap_area_root.rb_node;
+ struct vmap_area *va = NULL;
+
+ while (n) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ if (end < va->va_end)
+ n = n->rb_left;
+ else if (end > va->va_end)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ if (!va)
+ return false;
+
+ if (va->va_end > end) {
+ *pnext = va;
+ *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ } else {
+ *pprev = va;
+ *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
+ }
+ return true;
+}
+
+/**
+ * pvm_determine_end - find the highest aligned address between two vmap_areas
+ * @pnext: in/out arg for the next vmap_area
+ * @pprev: in/out arg for the previous vmap_area
+ * @align: alignment
+ *
+ * Returns: determined end address
+ *
+ * Find the highest aligned address between *@pnext and *@pprev below
+ * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
+ * down address is between the end addresses of the two vmap_areas.
+ *
+ * Please note that the address returned by this function may fall
+ * inside *@pnext vmap_area. The caller is responsible for checking
+ * that.
+ */
+static unsigned long pvm_determine_end(struct vmap_area **pnext,
+ struct vmap_area **pprev,
+ unsigned long align)
+{
+ const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long addr;
+
+ if (*pnext)
+ addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
+ else
+ addr = vmalloc_end;
+
+ while (*pprev && (*pprev)->va_end > addr) {
+ *pnext = *pprev;
+ *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ }
+
+ return addr;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ * vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas. This function allocates
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans areas from the end looking for
+ * matching slot. While scanning, if any of the areas overlaps with
+ * existing vmap_area, the base address is pulled down to fit the
+ * area. Scanning is repeated till all the areas fit and then all
+ * necessary data structures are inserted and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+ const size_t *sizes, int nr_vms,
+ size_t align)
+{
+ const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+ const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ struct vmap_area **vas, *prev, *next;
+ struct vm_struct **vms;
+ int area, area2, last_area, term_area;
+ unsigned long base, start, end, last_end;
+ bool purged = false;
+
+ /* verify parameters and allocate data structures */
+ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+ for (last_area = 0, area = 0; area < nr_vms; area++) {
+ start = offsets[area];
+ end = start + sizes[area];
+
+ /* is everything aligned properly? */
+ BUG_ON(!IS_ALIGNED(offsets[area], align));
+ BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+ /* detect the area with the highest address */
+ if (start > offsets[last_area])
+ last_area = area;
+
+ for (area2 = 0; area2 < nr_vms; area2++) {
+ unsigned long start2 = offsets[area2];
+ unsigned long end2 = start2 + sizes[area2];
+
+ if (area2 == area)
+ continue;
+
+ BUG_ON(start2 >= start && start2 < end);
+ BUG_ON(end2 <= end && end2 > start);
+ }
+ }
+ last_end = offsets[last_area] + sizes[last_area];
+
+ if (vmalloc_end - vmalloc_start < last_end) {
+ WARN_ON(true);
+ return NULL;
+ }
+
+ vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+ vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+ if (!vas || !vms)
+ goto err_free2;
+
+ for (area = 0; area < nr_vms; area++) {
+ vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+ if (!vas[area] || !vms[area])
+ goto err_free;
+ }
+retry:
+ spin_lock(&vmap_area_lock);
+
+ /* start scanning - we scan from the top, begin with the last area */
+ area = term_area = last_area;
+ start = offsets[area];
+ end = start + sizes[area];
+
+ if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
+ base = vmalloc_end - last_end;
+ goto found;
+ }
+ base = pvm_determine_end(&next, &prev, align) - end;
+
+ while (true) {
+ BUG_ON(next && next->va_end <= base + end);
+ BUG_ON(prev && prev->va_end > base + end);
+
+ /*
+ * base might have underflowed, add last_end before
+ * comparing.
+ */
+ if (base + last_end < vmalloc_start + last_end) {
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+ goto retry;
+ }
+ goto err_free;
+ }
+
+ /*
+ * If next overlaps, move base downwards so that it's
+ * right below next and then recheck.
+ */
+ if (next && next->va_start < base + end) {
+ base = pvm_determine_end(&next, &prev, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * If prev overlaps, shift down next and prev and move
+ * base so that it's right below new next and then
+ * recheck.
+ */
+ if (prev && prev->va_end > base + start) {
+ next = prev;
+ prev = node_to_va(rb_prev(&next->rb_node));
+ base = pvm_determine_end(&next, &prev, align) - end;
+ term_area = area;
+ continue;
+ }
+
+ /*
+ * This area fits, move on to the previous one. If
+ * the previous one is the terminal one, we're done.
+ */
+ area = (area + nr_vms - 1) % nr_vms;
+ if (area == term_area)
+ break;
+ start = offsets[area];
+ end = start + sizes[area];
+ pvm_find_next_prev(base + end, &next, &prev);
+ }
+found:
+ /* we've found a fitting base, insert all va's */
+ for (area = 0; area < nr_vms; area++) {
+ struct vmap_area *va = vas[area];
- /* Prevent "things" like memory migration? VM_flags need a cleanup... */
- vma->vm_flags |= VM_RESERVED;
+ va->va_start = base + offsets[area];
+ va->va_end = va->va_start + sizes[area];
+ __insert_vmap_area(va);
+ }
+
+ vmap_area_pcpu_hole = base + offsets[last_area];
+
+ spin_unlock(&vmap_area_lock);
+
+ /* insert all vm's */
+ for (area = 0; area < nr_vms; area++)
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ pcpu_get_vm_areas);
+
+ kfree(vas);
+ return vms;
+
+err_free:
+ for (area = 0; area < nr_vms; area++) {
+ kfree(vas[area]);
+ kfree(vms[area]);
+ }
+err_free2:
+ kfree(vas);
+ kfree(vms);
+ return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+ int i;
+
+ for (i = 0; i < nr_vms; i++)
+ free_vm_area(vms[i]);
+ kfree(vms);
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PROC_FS
+static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_area_lock)
+{
+ loff_t n = *pos;
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ while (n > 0 && &va->list != &vmap_area_list) {
+ n--;
+ va = list_entry(va->list.next, typeof(*va), list);
+ }
+ if (!n && &va->list != &vmap_area_list)
+ return va;
+
+ return NULL;
+
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct vmap_area *va = p, *next;
+
+ ++*pos;
+ next = list_entry(va->list.next, typeof(*va), list);
+ if (&next->list != &vmap_area_list)
+ return next;
+
+ return NULL;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmap_area_lock)
+{
+ spin_unlock(&vmap_area_lock);
+}
+
+static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+{
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ unsigned int nr, *counters = m->private;
+
+ if (!counters)
+ return;
+
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+ if (v->flags & VM_UNINITIALIZED)
+ return;
+
+ memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+ for (nr = 0; nr < v->nr_pages; nr++)
+ counters[page_to_nid(v->pages[nr])]++;
+
+ for_each_node_state(nr, N_HIGH_MEMORY)
+ if (counters[nr])
+ seq_printf(m, " N%u=%u", nr, counters[nr]);
+ }
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct vmap_area *va = p;
+ struct vm_struct *v;
+
+ /*
+ * s_show can race with remove_vm_area(): !VM_VM_AREA means the vmap
+ * area is being torn down or belongs to a vm_map_ram allocation.
+ */
+ if (!(va->flags & VM_VM_AREA))
+ return 0;
+ v = va->vm;
+
+ seq_printf(m, "0x%pK-0x%pK %7ld",
+ v->addr, v->addr + v->size, v->size);
+
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
+
+ if (v->nr_pages)
+ seq_printf(m, " pages=%d", v->nr_pages);
+
+ if (v->phys_addr)
+ seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
+
+ if (v->flags & VM_IOREMAP)
+ seq_puts(m, " ioremap");
+
+ if (v->flags & VM_ALLOC)
+ seq_puts(m, " vmalloc");
+
+ if (v->flags & VM_MAP)
+ seq_puts(m, " vmap");
+
+ if (v->flags & VM_USERMAP)
+ seq_puts(m, " user");
+
+ if (v->flags & VM_VPAGES)
+ seq_puts(m, " vpages");
+
+ show_numa_info(m, v);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static const struct seq_operations vmalloc_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int vmalloc_open(struct inode *inode, struct file *file)
+{
+ unsigned int *ptr = NULL;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ if (ptr == NULL)
+ return -ENOMEM;
+ }
+ ret = seq_open(file, &vmalloc_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = ptr;
+ } else
+ kfree(ptr);
return ret;
+}
-out_einval_locked:
- read_unlock(&vmlist_lock);
- return -EINVAL;
+static const struct file_operations proc_vmalloc_operations = {
+ .open = vmalloc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+static int __init proc_vmalloc_init(void)
+{
+ proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
+ return 0;
}
-EXPORT_SYMBOL(remap_vmalloc_range);
+module_init(proc_vmalloc_init);
+
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+ struct vmap_area *va;
+ unsigned long free_area_size;
+ unsigned long prev_end;
+
+ vmi->used = 0;
+ vmi->largest_chunk = 0;
+
+ prev_end = VMALLOC_START;
+
+ spin_lock(&vmap_area_lock);
+
+ if (list_empty(&vmap_area_list)) {
+ vmi->largest_chunk = VMALLOC_TOTAL;
+ goto out;
+ }
+
+ list_for_each_entry(va, &vmap_area_list, list) {
+ unsigned long addr = va->va_start;
+
+ /*
+ * Some archs keep another range for modules in vmalloc space
+ */
+ if (addr < VMALLOC_START)
+ continue;
+ if (addr >= VMALLOC_END)
+ break;
+
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+ continue;
+
+ vmi->used += (va->va_end - va->va_start);
+
+ free_area_size = addr - prev_end;
+ if (vmi->largest_chunk < free_area_size)
+ vmi->largest_chunk = free_area_size;
+
+ prev_end = va->va_end;
+ }
+
+ if (VMALLOC_END - prev_end > vmi->largest_chunk)
+ vmi->largest_chunk = VMALLOC_END - prev_end;
+
+out:
+ spin_unlock(&vmap_area_lock);
+}
+#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 00000000000..d4042e75f7c
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,380 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ * Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+
+/*
+ * The window size (vmpressure_win) is the number of scanned pages before
+ * we try to analyze scanned/reclaimed ratio. So the window is used as a
+ * rate-limit tunable for the "low" level notification, and also for
+ * averaging the ratio for medium/critical levels. Using small window
+ * sizes can cause a lot of false positives, but too big a window size will
+ * delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiple of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+
+/*
+ * These thresholds are used when we account memory pressure through
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percents: the higher the value, the more
+ * unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as the number of pages will be less than "window size".
+ * However, in that case the vmscan priority will rise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0 : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). Current value for the vmpressure_level_critical_prio is chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+ return container_of(work, struct vmpressure, work);
+}
+
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+ struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return memcg_to_vmpressure(memcg);
+}
+
+enum vmpressure_levels {
+ VMPRESSURE_LOW = 0,
+ VMPRESSURE_MEDIUM,
+ VMPRESSURE_CRITICAL,
+ VMPRESSURE_NUM_LEVELS,
+};
+
+static const char * const vmpressure_str_levels[] = {
+ [VMPRESSURE_LOW] = "low",
+ [VMPRESSURE_MEDIUM] = "medium",
+ [VMPRESSURE_CRITICAL] = "critical",
+};
+
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+ if (pressure >= vmpressure_level_critical)
+ return VMPRESSURE_CRITICAL;
+ else if (pressure >= vmpressure_level_med)
+ return VMPRESSURE_MEDIUM;
+ return VMPRESSURE_LOW;
+}
+
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+ unsigned long reclaimed)
+{
+ unsigned long scale = scanned + reclaimed;
+ unsigned long pressure;
+
+ /*
+ * We calculate the ratio (in percents) of how many pages were
+ * scanned vs. reclaimed in a given time frame (window). Note that
+ * time is in VM reclaimer's "ticks", i.e. number of pages
+ * scanned. This makes it possible to set desired reaction time
+ * and serves as a ratelimit.
+ */
+ pressure = scale - (reclaimed * scale / scanned);
+ pressure = pressure * 100 / scale;
+
+ pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+ scanned, reclaimed);
+
+ return vmpressure_level(pressure);
+}
+
+struct vmpressure_event {
+ struct eventfd_ctx *efd;
+ enum vmpressure_levels level;
+ struct list_head node;
+};
+
+static bool vmpressure_event(struct vmpressure *vmpr,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ struct vmpressure_event *ev;
+ enum vmpressure_levels level;
+ bool signalled = false;
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ mutex_lock(&vmpr->events_lock);
+
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (level >= ev->level) {
+ eventfd_signal(ev->efd, 1);
+ signalled = true;
+ }
+ }
+
+ mutex_unlock(&vmpr->events_lock);
+
+ return signalled;
+}
+
+static void vmpressure_work_fn(struct work_struct *work)
+{
+ struct vmpressure *vmpr = work_to_vmpressure(work);
+ unsigned long scanned;
+ unsigned long reclaimed;
+
+ /*
+ * Several contexts might be calling vmpressure(), so it is
+ * possible that the work was rescheduled again before the old
+ * work context cleared the counters. In that case we will run
+ * just after the old work returns, but then scanned might be zero
+ * here. No need for any locks here since we don't care if
+ * vmpr->reclaimed is in sync.
+ */
+ if (!vmpr->scanned)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned;
+ reclaimed = vmpr->reclaimed;
+ vmpr->scanned = 0;
+ vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ do {
+ if (vmpressure_event(vmpr, scanned, reclaimed))
+ break;
+ /*
+ * If not handled, propagate the event upward into the
+ * hierarchy.
+ */
+ } while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+
+ /*
+ * Here we only want to account pressure that userland is able to
+ * help us with. For example, suppose that DMA zone is under
+ * pressure; if we notify userland about that kind of pressure,
+ * then it will be mostly a waste as it will trigger unnecessary
+ * freeing of memory by userland (since userland is more likely to
+ * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
+ * is why we include only movable, highmem and FS/IO pages.
+ * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
+ * we account it too.
+ */
+ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+ return;
+
+ /*
+ * If we got here with no pages scanned, then that is an indicator
+ * that reclaimer was unable to find any shrinkable LRUs at the
+ * current scanning depth. But it does not mean that we should
+ * report the critical pressure, yet. If the scanning priority
+ * (scanning depth) goes too high (deep), we will be notified
+ * through vmpressure_prio(). But so far, keep calm.
+ */
+ if (!scanned)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ vmpr->scanned += scanned;
+ vmpr->reclaimed += reclaimed;
+ scanned = vmpr->scanned;
+ spin_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @prio: reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time when
+ * the vmscan's reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+ /*
+ * We only use prio for accounting critical level. For more info
+ * see comment for vmpressure_level_critical_prio variable above.
+ */
+ if (prio > vmpressure_level_critical_prio)
+ return;
+
+ /*
+ * OK, the prio is below the threshold, updating vmpressure
+ * information before shrinker dives into long shrinking of long
+ * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
+ * to the vmpressure() basically means that we signal 'critical'
+ * level.
+ */
+ vmpressure(gfp, memcg, vmpressure_win, 0);
+}
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @memcg: memcg that is interested in vmpressure notifications
+ * @eventfd: eventfd context to link notifications with
+ * @args: event arguments (used to set up a pressure level threshold)
+ *
+ * This function associates eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a string that denotes pressure level
+ * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
+ * "critical").
+ *
+ * To be used as memcg event method.
+ */
+int vmpressure_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure_event *ev;
+ int level;
+
+ for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
+ if (!strcmp(vmpressure_str_levels[level], args))
+ break;
+ }
+
+ if (level >= VMPRESSURE_NUM_LEVELS)
+ return -EINVAL;
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev)
+ return -ENOMEM;
+
+ ev->efd = eventfd;
+ ev->level = level;
+
+ mutex_lock(&vmpr->events_lock);
+ list_add(&ev->node, &vmpr->events);
+ mutex_unlock(&vmpr->events_lock);
+
+ return 0;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @memcg: memcg handle
+ * @eventfd: eventfd context that was used to link vmpressure with the @memcg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * To be used as memcg event method.
+ */
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ struct vmpressure_event *ev;
+
+ mutex_lock(&vmpr->events_lock);
+ list_for_each_entry(ev, &vmpr->events, node) {
+ if (ev->efd != eventfd)
+ continue;
+ list_del(&ev->node);
+ kfree(ev);
+ break;
+ }
+ mutex_unlock(&vmpr->events_lock);
+}
+
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr: Structure to be initialized
+ *
+ * This function should be called on every allocated vmpressure structure
+ * before any usage.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+ spin_lock_init(&vmpr->sr_lock);
+ mutex_init(&vmpr->events_lock);
+ INIT_LIST_HEAD(&vmpr->events);
+ INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
+
+/**
+ * vmpressure_cleanup() - shuts down vmpressure control structure
+ * @vmpr: Structure to be cleaned up
+ *
+ * This function should be called before the structure in which it is
+ * embedded is cleaned up.
+ */
+void vmpressure_cleanup(struct vmpressure *vmpr)
+{
+ /*
+ * Make sure there is no pending work before eventfd infrastructure
+ * goes away.
+ */
+ flush_work(&vmpr->work);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eca70310adb..0f16ffe8eb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,14 +11,17 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
@@ -26,56 +29,77 @@
#include <linux/buffer_head.h> /* for try_to_release_page(),
buffer_heads_over_limit */
#include <linux/mm_inline.h>
-#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/memcontrol.h>
+#include <linux/delayacct.h>
+#include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/prefetch.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
+
+ /* How many pages shrink_list() should reclaim */
+ unsigned long nr_to_reclaim;
+
+ unsigned long hibernation_mode;
+
/* This context's GFP mask */
gfp_t gfp_mask;
int may_writepage;
+ /* Can mapped pages be reclaimed? */
+ int may_unmap;
+
/* Can pages be swapped as part of reclaim? */
int may_swap;
- /* This context's SWAP_CLUSTER_MAX. If freeing memory for
- * suspend, we effectively ignore SWAP_CLUSTER_MAX.
- * In this context, it doesn't matter that we scan the
- * whole list at once. */
- int swap_cluster_max;
+ int order;
+ /* Scan (total_size >> priority) pages at once */
+ int priority;
+
+ /* anon vs. file LRUs scanning "ratio" */
int swappiness;
- int all_unreclaimable;
-};
+ /*
+ * The memory cgroup that hit its limit and as a result is the
+ * primary target of this reclaim invocation.
+ */
+ struct mem_cgroup *target_mem_cgroup;
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
- shrinker_t shrinker;
- struct list_head list;
- int seeks; /* seeks to recreate an obj */
- long nr; /* objs pending delete */
+ /*
+ * Nodemask of nodes allowed by the caller. If NULL, all nodes
+ * are scanned.
+ */
+ nodemask_t *nodemask;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -112,44 +136,204 @@ struct shrinker {
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-long vm_total_pages; /* The total number of pages which the VM controls */
+unsigned long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_MEMCG
+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup;
+}
+#else
+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+#endif
+
+/*
+ * Sum the LRU page counters of @zone that reclaim can work on: the
+ * active and inactive file lists always, and the active and inactive
+ * anon lists only while get_nr_swap_pages() reports swap space left.
+ *
+ * The accumulator has the same type as the return value so that the
+ * sum is not narrowed through a signed int before being returned.
+ */
+static unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+ unsigned long nr;
+
+ nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_FILE);
+
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+
+ return nr;
+}
+
+bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_get_lru_size(lruvec, lru);
+
+ return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+}
+
/*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
*/
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
-{
- struct shrinker *shrinker;
-
- shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
- if (shrinker) {
- shrinker->shrinker = theshrinker;
- shrinker->seeks = seeks;
- shrinker->nr = 0;
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- up_write(&shrinker_rwsem);
- }
- return shrinker;
+int register_shrinker(struct shrinker *shrinker)
+{
+ size_t size = sizeof(*shrinker->nr_deferred);
+
+ /*
+ * If we only have one possible node in the system anyway, save
+ * ourselves the trouble and disable NUMA aware behavior. This way we
+ * will save memory and some small loop time later.
+ */
+ if (nr_node_ids == 1)
+ shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+ if (shrinker->flags & SHRINKER_NUMA_AWARE)
+ size *= nr_node_ids;
+
+ shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+ if (!shrinker->nr_deferred)
+ return -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+ list_add_tail(&shrinker->list, &shrinker_list);
+ up_write(&shrinker_rwsem);
+ return 0;
}
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
/*
* Remove one
*/
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
{
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
- kfree(shrinker);
+ kfree(shrinker->nr_deferred);
}
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
+
+/*
+ * Run one shrinker against the node selected by shrinkctl->nid.
+ *
+ * The scan target is the work deferred from earlier calls
+ * (shrinker->nr_deferred[nid]) plus a delta derived from nr_pages_scanned,
+ * lru_pages, shrinker->seeks and the object count reported by
+ * ->count_objects(). ->scan_objects() is then called in batches of
+ * shrinker->batch (or SHRINK_BATCH when that is zero) until the target is
+ * used up or the shrinker returns SHRINK_STOP. Any unused part of the
+ * target is added back to nr_deferred[nid].
+ *
+ * Returns the sum of the values returned by ->scan_objects().
+ */
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+ unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+ unsigned long freed = 0;
+ unsigned long long delta;
+ long total_scan;
+ long freeable;
+ long nr;
+ long new_nr;
+ int nid = shrinkctl->nid;
+ long batch_size = shrinker->batch ? shrinker->batch
+ : SHRINK_BATCH;
+
+ /* Nothing to do when the shrinker reports no objects. */
+ freeable = shrinker->count_objects(shrinker, shrinkctl);
+ if (freeable == 0)
+ return 0;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+ /* total_scan = nr + (4 * nr_pages_scanned / seeks) * freeable / (lru_pages + 1) */
+ total_scan = nr;
+ delta = (4 * nr_pages_scanned) / shrinker->seeks;
+ delta *= freeable;
+ do_div(delta, lru_pages + 1);
+ total_scan += delta;
+ if (total_scan < 0) {
+ printk(KERN_ERR
+ "shrink_slab: %pF negative objects to delete nr=%ld\n",
+ shrinker->scan_objects, total_scan);
+ total_scan = freeable;
+ }
+
+ /*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * freeable. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < freeable / 4)
+ total_scan = min(total_scan, freeable / 2);
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (total_scan > freeable * 2)
+ total_scan = freeable * 2;
+
+ trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+ nr_pages_scanned, lru_pages,
+ freeable, delta, total_scan);
+
+ /*
+ * Normally, we should not scan less than batch_size objects in one
+ * pass to avoid too frequent shrinker calls, but if the slab has less
+ * than batch_size objects in total and we are really tight on memory,
+ * we will try to reclaim all available objects, otherwise we can end
+ * up failing allocations although there are plenty of reclaimable
+ * objects spread over several slabs with usage less than the
+ * batch_size.
+ *
+ * We detect the "tight on memory" situations by looking at the total
+ * number of objects we want to scan (total_scan). If it is greater
+ * than the total number of objects on slab (freeable), we must be
+ * scanning at high prio and therefore should try to reclaim as much as
+ * possible.
+ */
+ while (total_scan >= batch_size ||
+ total_scan >= freeable) {
+ unsigned long ret;
+ unsigned long nr_to_scan = min(batch_size, total_scan);
+
+ shrinkctl->nr_to_scan = nr_to_scan;
+ ret = shrinker->scan_objects(shrinker, shrinkctl);
+ if (ret == SHRINK_STOP)
+ break;
+ freed += ret;
+
+ count_vm_events(SLABS_SCANNED, nr_to_scan);
+ total_scan -= nr_to_scan;
+
+ cond_resched();
+ }
+
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ if (total_scan > 0)
+ new_nr = atomic_long_add_return(total_scan,
+ &shrinker->nr_deferred[nid]);
+ else
+ new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+ trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+ return freed;
+}
+
/*
* Call the shrink functions to age shrinkable caches
*
@@ -158,7 +342,7 @@ EXPORT_SYMBOL(remove_shrinker);
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
- * If the vm encounted mapped pages on the LRU it increase the pressure on
+ * If the vm encountered mapped pages on the LRU it increases the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
@@ -169,94 +353,60 @@ EXPORT_SYMBOL(remove_shrinker);
*
* Returns the number of slab objects which we shrunk.
*/
-unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
+ unsigned long nr_pages_scanned,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
- unsigned long ret = 0;
-
- if (scanned == 0)
- scanned = SWAP_CLUSTER_MAX;
+ unsigned long freed = 0;
- if (!down_read_trylock(&shrinker_rwsem))
- return 1; /* Assume we'll be able to shrink next time */
-
- list_for_each_entry(shrinker, &shrinker_list, list) {
- unsigned long long delta;
- unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
-
- delta = (4 * scanned) / shrinker->seeks;
- delta *= max_pass;
- do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
- printk(KERN_ERR "%s: nr=%ld\n",
- __FUNCTION__, shrinker->nr);
- shrinker->nr = max_pass;
- }
+ if (nr_pages_scanned == 0)
+ nr_pages_scanned = SWAP_CLUSTER_MAX;
+ if (!down_read_trylock(&shrinker_rwsem)) {
/*
- * Avoid risking looping forever due to too large nr value:
- * never try to free more than twice the estimate number of
- * freeable entries.
+ * If we would return 0, our callers would understand that we
+ * have nothing else to shrink and give up trying. By returning
+ * 1 we keep it going and assume we'll be able to shrink next
+ * time.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
-
- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ freed = 1;
+ goto out;
+ }
- while (total_scan >= SHRINK_BATCH) {
- long this_scan = SHRINK_BATCH;
- int shrink_ret;
- int nr_before;
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ shrinkctl->nid = 0;
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
+ continue;
+ }
- nr_before = (*shrinker->shrinker)(0, gfp_mask);
- shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
- if (shrink_ret == -1)
- break;
- if (shrink_ret < nr_before)
- ret += nr_before - shrink_ret;
- count_vm_events(SLABS_SCANNED, this_scan);
- total_scan -= this_scan;
+ for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+ if (node_online(shrinkctl->nid))
+ freed += shrink_slab_node(shrinkctl, shrinker,
+ nr_pages_scanned, lru_pages);
- cond_resched();
}
-
- shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return ret;
-}
-
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
-
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
+out:
+ cond_resched();
+ return freed;
}
static inline int is_page_cache_freeable(struct page *page)
{
- return page_count(page) - !!PagePrivate(page) == 2;
+ /*
+ * A freeable page cache page is referenced only by the caller
+ * that isolated the page, the page cache radix tree and
+ * optional buffer heads at page->private.
+ */
+ return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi)
+static int may_write_to_queue(struct backing_dev_info *bdi,
+ struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -283,12 +433,8 @@ static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
lock_page(page);
- if (page_mapping(page) == mapping) {
- if (error == -ENOSPC)
- set_bit(AS_ENOSPC, &mapping->flags);
- else
- set_bit(AS_EIO, &mapping->flags);
- }
+ if (page_mapping(page) == mapping)
+ mapping_set_error(mapping, error);
unlock_page(page);
}
@@ -308,7 +454,8 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+ struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -317,7 +464,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
- * If this process is currently in generic_file_write() against
+ * If this process is currently in __generic_file_write_iter() against
* this page's queue, we can perform writeback even if that
* will block.
*
@@ -325,7 +472,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
- * See swapfile.c:page_queue_congested().
*/
if (!is_page_cache_freeable(page))
return PAGE_KEEP;
@@ -334,10 +480,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
* Some data journaling orphaned pages can have
* page->mapping == NULL while being dirty with clean buffers.
*/
- if (PagePrivate(page)) {
+ if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
ClearPageDirty(page);
- printk("%s: orphaned page\n", __FUNCTION__);
+ pr_info("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
}
}
@@ -345,7 +491,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info))
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -355,7 +501,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1,
};
@@ -367,10 +512,12 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
+
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -378,12 +525,17 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
return PAGE_CLEAN;
}
-int remove_mapping(struct address_space *mapping, struct page *page)
+/*
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
+ */
+static int __remove_mapping(struct address_space *mapping, struct page *page,
+ bool reclaimed)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
@@ -409,109 +561,456 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (unlikely(page_count(page) != 2))
+ if (!page_freeze_refs(page, 2))
goto cannot_free;
- smp_rmb();
- if (unlikely(PageDirty(page)))
+ /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ if (unlikely(PageDirty(page))) {
+ page_unfreeze_refs(page, 2);
goto cannot_free;
+ }
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- return 1;
+ spin_unlock_irq(&mapping->tree_lock);
+ swapcache_free(swap, page);
+ } else {
+ void (*freepage)(struct page *);
+ void *shadow = NULL;
+
+ freepage = mapping->a_ops->freepage;
+ /*
+ * Remember a shadow entry for reclaimed file cache in
+ * order to detect refaults, thus thrashing, later on.
+ *
+ * But don't store shadows in an address space that is
+ * already exiting. This is not just an optimization,
+ * inode reclaim needs to empty out the radix tree or
+ * the nodes are lost. Don't plant shadows behind its
+ * back.
+ */
+ if (reclaimed && page_is_file_cache(page) &&
+ !mapping_exiting(mapping))
+ shadow = workingset_eviction(mapping, page);
+ __delete_from_page_cache(page, shadow);
+ spin_unlock_irq(&mapping->tree_lock);
+ mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
return 1;
cannot_free:
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
return 0;
}
/*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	/* The page could not be detached: report failure to the caller. */
+	if (!__remove_mapping(mapping, page, false))
+		return 0;
+
+	/*
+	 * The page comes back with its refcount frozen. Unfreezing it to 1
+	 * instead of 2 drops the pagecache reference in the same step, so
+	 * no separate atomic decrement is needed.
+	 */
+	page_unfreeze_refs(page, 1);
+	return 1;
+}
+
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+void putback_lru_page(struct page *page)
+{
+ bool is_unevictable;
+ /* Remember the state on entry so the vm event counters below can be updated. */
+ int was_unevictable = PageUnevictable(page);
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+redo:
+ ClearPageUnevictable(page);
+
+ if (page_evictable(page)) {
+ /*
+ * For evictable pages, we can use the cache.
+ * In event of a race, worst case is we end up with an
+ * unevictable page on [in]active list.
+ * We know how to handle that.
+ */
+ is_unevictable = false;
+ lru_cache_add(page);
+ } else {
+ /*
+ * Put unevictable pages directly on zone's unevictable
+ * list.
+ */
+ is_unevictable = true;
+ add_page_to_unevictable_list(page);
+ /*
+ * When racing with an mlock or AS_UNEVICTABLE clearing
+ * (page is unlocked) make sure that if the other thread
+ * does not observe our setting of PG_lru and fails
+ * isolation/check_move_unevictable_pages,
+ * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
+ * the page back to the evictable list.
+ *
+ * The other side is TestClearPageMlocked() or shmem_lock().
+ */
+ smp_mb();
+ }
+
+ /*
+ * The page's status can change while we move it between LRU lists.
+ * If an evictable page ends up on the unevictable list, it will never
+ * be freed. To avoid that, check again after adding it to the list.
+ */
+ if (is_unevictable && page_evictable(page)) {
+ if (!isolate_lru_page(page)) {
+ put_page(page);
+ goto redo;
+ }
+ /*
+ * Isolation failed, which means someone else took this page
+ * off the LRU. It will be freed or put back on the LRU by
+ * them, so there is nothing to do here.
+ */
+ }
+
+ if (was_unevictable && !is_unevictable)
+ count_vm_event(UNEVICTABLE_PGRESCUED);
+ else if (!was_unevictable && is_unevictable)
+ count_vm_event(UNEVICTABLE_PGCULLED);
+
+ put_page(page); /* drop ref from isolate */
+}
+
+/* Verdict of page_check_references(), acted upon by shrink_page_list(). */
+enum page_references {
+ PAGEREF_RECLAIM, /* try to reclaim the page */
+ PAGEREF_RECLAIM_CLEAN, /* try to reclaim, but keep the page if it is dirty */
+ PAGEREF_KEEP, /* leave the page on the list for another pass */
+ PAGEREF_ACTIVATE, /* activate the page */
+};
+
+/*
+ * Decide what reclaim should do with @page based on how it was referenced.
+ *
+ * Both the page table references (page_referenced()) and the PG_referenced
+ * flag are sampled, and the flag is cleared, before any decision is made.
+ * Returns one of the PAGEREF_* verdicts.
+ */
+static enum page_references page_check_references(struct page *page,
+ struct scan_control *sc)
+{
+ int referenced_ptes, referenced_page;
+ unsigned long vm_flags;
+
+ referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
+ &vm_flags);
+ referenced_page = TestClearPageReferenced(page);
+
+ /*
+ * Mlock lost the isolation race with us. Let try_to_unmap()
+ * move the page to the unevictable list.
+ */
+ if (vm_flags & VM_LOCKED)
+ return PAGEREF_RECLAIM;
+
+ if (referenced_ptes) {
+ /* Swap-backed pages referenced through a pte are always activated. */
+ if (PageSwapBacked(page))
+ return PAGEREF_ACTIVATE;
+ /*
+ * All mapped pages start out with page table
+ * references from the instantiating fault, so we need
+ * to look twice if a mapped file page is used more
+ * than once.
+ *
+ * Mark it and spare it for another trip around the
+ * inactive list. Another page table reference will
+ * lead to its activation.
+ *
+ * Note: the mark is set for activated pages as well
+ * so that recently deactivated but used pages are
+ * quickly recovered.
+ */
+ SetPageReferenced(page);
+
+ if (referenced_page || referenced_ptes > 1)
+ return PAGEREF_ACTIVATE;
+
+ /*
+ * Activate file-backed executable pages after first usage.
+ */
+ if (vm_flags & VM_EXEC)
+ return PAGEREF_ACTIVATE;
+
+ return PAGEREF_KEEP;
+ }
+
+ /* Reclaim if clean, defer dirty pages to writeback */
+ if (referenced_page && !PageSwapBacked(page))
+ return PAGEREF_RECLAIM_CLEAN;
+
+ return PAGEREF_RECLAIM;
+}
+
+/*
+ * Report through @dirty and @writeback whether @page should be treated as
+ * dirty and/or under writeback for reclaim accounting.
+ */
+static void page_check_dirty_writeback(struct page *page,
+				       bool *dirty, bool *writeback)
+{
+	struct address_space *mapping;
+
+	if (!page_is_file_cache(page)) {
+		/*
+		 * Anonymous pages are not handled by flushers and must be
+		 * written from reclaim context, so never stall reclaim on
+		 * their account.
+		 */
+		*dirty = false;
+		*writeback = false;
+		return;
+	}
+
+	/* Start from the page flags... */
+	*dirty = PageDirty(page);
+	*writeback = PageWriteback(page);
+
+	/* ...and let the filesystem refine them when it offers a hook. */
+	if (page_has_private(page)) {
+		mapping = page_mapping(page);
+		if (mapping && mapping->a_ops->is_dirty_writeback)
+			mapping->a_ops->is_dirty_writeback(page, dirty,
+							   writeback);
+	}
+}
+
+/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc)
+ struct zone *zone,
+ struct scan_control *sc,
+ enum ttu_flags ttu_flags,
+ unsigned long *ret_nr_dirty,
+ unsigned long *ret_nr_unqueued_dirty,
+ unsigned long *ret_nr_congested,
+ unsigned long *ret_nr_writeback,
+ unsigned long *ret_nr_immediate,
+ bool force_reclaim)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_unqueued_dirty = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_writeback = 0;
+ unsigned long nr_immediate = 0;
cond_resched();
- pagevec_init(&freed_pvec, 1);
+ mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- int referenced;
+ enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ bool dirty, writeback;
cond_resched();
page = lru_to_page(page_list);
list_del(&page->lru);
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
- VM_BUG_ON(PageActive(page));
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
- if (!sc->may_swap && page_mapped(page))
+ if (unlikely(!page_evictable(page)))
+ goto cull_mlocked;
+
+ if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
- if (PageWriteback(page))
- goto keep_locked;
+ may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+ /*
+ * The number of dirty pages determines if a zone is marked
+ * reclaim_congested which affects wait_iff_congested. kswapd
+ * will stall and start writing pages if the tail of the LRU
+ * is all dirty unqueued pages.
+ */
+ page_check_dirty_writeback(page, &dirty, &writeback);
+ if (dirty || writeback)
+ nr_dirty++;
+
+ if (dirty && !writeback)
+ nr_unqueued_dirty++;
+
+ /*
+ * Treat this page as congested if the underlying BDI is or if
+ * pages are cycling through the LRU so quickly that the
+ * pages marked for immediate reclaim are making it to the
+ * end of the LRU a second time.
+ */
+ mapping = page_mapping(page);
+ if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
+ (writeback && PageReclaim(page)))
+ nr_congested++;
+
+ /*
+ * If a page at the tail of the LRU is under writeback, there
+ * are three cases to consider.
+ *
+ * 1) If reclaim is encountering an excessive number of pages
+ * under writeback and this page is both under writeback and
+ * PageReclaim then it indicates that pages are being queued
+ * for IO but are being recycled through the LRU before the
+ * IO can complete. Waiting on the page itself risks an
+ * indefinite stall if it is impossible to writeback the
+ * page due to IO error or disconnected storage so instead
+ * note that the LRU is being scanned too quickly and the
+ * caller can stall after page list has been processed.
+ *
+ * 2) Global reclaim encounters a page, memcg encounters a
+ * page that is not marked for immediate reclaim or
+ * the caller does not have __GFP_IO. In this case mark
+ * the page for immediate reclaim and continue scanning.
+ *
+ * __GFP_IO is checked because a loop driver thread might
+ * enter reclaim, and deadlock if it waits on a page for
+ * which it is needed to do the write (loop masks off
+ * __GFP_IO|__GFP_FS for this reason); but more thought
+ * would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into the
+ * FS, just waiting on its writeback completion. Worryingly,
+ * ext4 gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+ * may_enter_fs here is liable to OOM on them.
+ *
+ * 3) memcg encounters a page that is not already marked
+ * PageReclaim. memcg does not have any dirty pages
+ * throttling so we could easily OOM just because too many
+ * pages are in writeback and there is nothing else to
+ * reclaim. Wait for the writeback to complete.
+ */
+ if (PageWriteback(page)) {
+ /* Case 1 above */
+ if (current_is_kswapd() &&
+ PageReclaim(page) &&
+ zone_is_reclaim_writeback(zone)) {
+ nr_immediate++;
+ goto keep_locked;
+
+ /* Case 2 above */
+ } else if (global_reclaim(sc) ||
+ !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ /*
+ * This is slightly racy - end_page_writeback()
+ * might have just cleared PageReclaim, then
+ * setting PageReclaim here ends up interpreted
+ * as PageReadahead - but that does not matter
+ * enough to care. What we do want is for this
+ * page to have PageReclaim set next time memcg
+ * reclaim reaches the tests above, so it will
+ * then wait_on_page_writeback() to avoid OOM;
+ * and it's also appropriate in global reclaim.
+ */
+ SetPageReclaim(page);
+ nr_writeback++;
+
+ goto keep_locked;
+
+ /* Case 3 above */
+ } else {
+ wait_on_page_writeback(page);
+ }
+ }
+
+ if (!force_reclaim)
+ references = page_check_references(page, sc);
- referenced = page_referenced(page, 1);
- /* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+ switch (references) {
+ case PAGEREF_ACTIVATE:
goto activate_locked;
+ case PAGEREF_KEEP:
+ goto keep_locked;
+ case PAGEREF_RECLAIM:
+ case PAGEREF_RECLAIM_CLEAN:
+ ; /* try to reclaim the page below */
+ }
-#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page))
- if (!add_to_swap(page, GFP_ATOMIC))
+ if (PageAnon(page) && !PageSwapCache(page)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (!add_to_swap(page, page_list))
goto activate_locked;
-#endif /* CONFIG_SWAP */
+ may_enter_fs = 1;
- mapping = page_mapping(page);
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ }
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, 0)) {
+ switch (try_to_unmap(page, ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
+ case SWAP_MLOCK:
+ goto cull_mlocked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
if (PageDirty(page)) {
- if (referenced)
+ /*
+ * Only kswapd can writeback filesystem pages to
+ * avoid risk of stack overflow but only writeback
+ * if many dirty pages have been encountered.
+ */
+ if (page_is_file_cache(page) &&
+ (!current_is_kswapd() ||
+ !zone_is_reclaim_dirty(zone))) {
+ /*
+ * Immediately reclaim when written back.
+ * Similar in principle to deactivate_page()
+ * except we already have the page isolated
+ * and know it's dirty
+ */
+ inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
+
+ goto keep_locked;
+ }
+
+ if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -519,19 +1018,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch(pageout(page, mapping)) {
+ switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page))
goto keep;
+ if (PageDirty(page))
+ goto keep;
+
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
@@ -551,7 +1053,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* possible for a page to have PageDirty set, but it is actually
* clean (all its buffers are clean). This happens if the
* buffers were written out directly, with submit_bh(). ext3
- * will do this, as well as the blockdev mapping.
+ * will do this, as well as the blockdev mapping.
* try_to_release_page() will discover that cleanness and will
* drop the buffers and mark the page clean - it can be freed.
*
@@ -562,39 +1064,184 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
*/
- if (PagePrivate(page)) {
+ if (page_has_private(page)) {
if (!try_to_release_page(page, sc->gfp_mask))
goto activate_locked;
- if (!mapping && page_count(page) == 1)
- goto free_it;
+ if (!mapping && page_count(page) == 1) {
+ unlock_page(page);
+ if (put_page_testzero(page))
+ goto free_it;
+ else {
+ /*
+ * rare race with speculative reference.
+ * the speculative reference will free
+ * this page shortly, so we may
+ * increment nr_reclaimed here (and
+ * leave it off the LRU).
+ */
+ nr_reclaimed++;
+ continue;
+ }
+ }
}
- if (!mapping || !remove_mapping(mapping, page))
+ if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
+ /*
+ * At this point, we have no other references and there is
+ * no way to pick any more up (removed from LRU, removed
+ * from pagecache). Can use non-atomic bitops now (and
+ * we obviously don't have to worry about waking up a process
+ * waiting on the page lock, because there are no references.
+ */
+ __clear_page_locked(page);
free_it:
- unlock_page(page);
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page))
- __pagevec_release_nonlru(&freed_pvec);
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ list_add(&page->lru, &free_pages);
+ continue;
+
+cull_mlocked:
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ unlock_page(page);
+ putback_lru_page(page);
continue;
activate_locked:
+ /* Not a candidate for swapping, so reclaim swap space. */
+ if (PageSwapCache(page) && vm_swap_full())
+ try_to_free_swap(page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
pgactivate++;
keep_locked:
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+
+ free_hot_cold_page_list(&free_pages, true);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_release_nonlru(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
+ mem_cgroup_uncharge_end();
+ *ret_nr_dirty += nr_dirty;
+ *ret_nr_congested += nr_congested;
+ *ret_nr_unqueued_dirty += nr_unqueued_dirty;
+ *ret_nr_writeback += nr_writeback;
+ *ret_nr_immediate += nr_immediate;
return nr_reclaimed;
}
+/*
+ * Reclaim the clean file cache pages found on @page_list.
+ *
+ * Pages that are file cache, not dirty and not isolated balloon pages are
+ * moved to a private list with PageActive cleared and handed to
+ * shrink_page_list() with force_reclaim set. Pages that could not be
+ * reclaimed are spliced back onto @page_list, and NR_ISOLATED_FILE of @zone
+ * is reduced by the number of pages reclaimed.
+ *
+ * Returns the number of pages reclaimed.
+ */
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    struct list_head *page_list)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_unmap = 1,
+	};
+	/*
+	 * shrink_page_list() accumulates into its counter arguments with
+	 * "+=", so these throwaway counters must start from a defined value
+	 * rather than being read uninitialized.
+	 */
+	unsigned long dummy1 = 0, dummy2 = 0, dummy3 = 0, dummy4 = 0, dummy5 = 0;
+	unsigned long ret;
+	struct page *page, *next;
+	LIST_HEAD(clean_pages);
+
+	list_for_each_entry_safe(page, next, page_list, lru) {
+		if (page_is_file_cache(page) && !PageDirty(page) &&
+		    !isolated_balloon_page(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &clean_pages);
+		}
+	}
+
+	ret = shrink_page_list(&clean_pages, zone, &sc,
+			       TTU_UNMAP|TTU_IGNORE_ACCESS,
+			       &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+	list_splice(&clean_pages, page_list);
+	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+	return ret;
+}
+
+/*
+ * Attempt to remove the specified page from its LRU. Only take this page
+ * if it is of the appropriate PageActive status. Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page: page to consider
+ * mode: one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+int __isolate_lru_page(struct page *page, isolate_mode_t mode)
+{
+ /* -EINVAL: the page is not a candidate at all (not on an LRU, or unevictable). */
+ int ret = -EINVAL;
+
+ /* Only take pages on the LRU. */
+ if (!PageLRU(page))
+ return ret;
+
+ /* Compaction should not handle unevictable pages but CMA can do so */
+ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
+ return ret;
+
+ /* From here on, a refusal is reported as -EBUSY. */
+ ret = -EBUSY;
+
+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_CLEAN means that only clean pages should be isolated. This
+ * is used by reclaim when it cannot write to backing storage.
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that the caller only
+ * wants pages that it is possible to migrate without blocking.
+ */
+ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+
+ /* ISOLATE_CLEAN means only clean pages */
+ if (mode & ISOLATE_CLEAN)
+ return ret;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking
+ */
+ mapping = page_mapping(page);
+ if (mapping && !mapping->a_ops->migratepage)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
+ /* Take a reference; this fails if the refcount has already dropped to zero. */
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+
+ return ret;
+}
+
/*
* zone->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
@@ -606,120 +1253,356 @@ keep:
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
- * @src: The LRU list to pull pages off.
+ * @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
- * @scanned: The number of pages that were scanned.
+ * @nr_scanned: The number of pages that were scanned.
+ * @sc: The scan_control struct for this reclaim session
+ * @mode: One of the LRU isolation modes
+ * @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
- struct list_head *src, struct list_head *dst,
- unsigned long *scanned)
+ struct lruvec *lruvec, struct list_head *dst,
+ unsigned long *nr_scanned, struct scan_control *sc,
+ isolate_mode_t mode, enum lru_list lru)
{
+ struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
- struct page *page;
unsigned long scan;
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- struct list_head *target;
+ struct page *page;
+ int nr_pages;
+
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
- list_del(&page->lru);
- target = src;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- target = dst;
- nr_taken++;
- } /* else it is being freed elsewhere */
+ switch (__isolate_lru_page(page, mode)) {
+ case 0:
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+ list_move(&page->lru, dst);
+ nr_taken += nr_pages;
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
- list_add(&page->lru, target);
+ default:
+ BUG();
+ }
}
- *scanned = scan;
+ *nr_scanned = scan;
+ trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+ nr_taken, mode, is_file_lru(lru));
return nr_taken;
}
+/**
+ * isolate_lru_page - tries to isolate a page from its LRU list
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list, clears PageLRU and adjusts the
+ * vmstat statistic corresponding to whatever LRU list the page was on.
+ *
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ *
+ * On success an additional reference has been taken on the page with
+ * get_page().
+ *
+ * The returned page will have PageLRU() cleared. If it was found on
+ * the active list, it will have PageActive set. If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
+ *
+ * The vmstat statistic corresponding to the list on which the page was
+ * found will be decremented.
+ *
+ * Restrictions:
+ * (1) Must be called with an elevated refcount on the page. This is a
+ * fundamental difference from isolate_lru_pages (which is called
+ * without a stable reference).
+ * (2) the lru_lock must not be held.
+ * (3) interrupts must be enabled.
+ */
+int isolate_lru_page(struct page *page)
+{
+ int ret = -EBUSY;
+
+ VM_BUG_ON_PAGE(!page_count(page), page); /* restriction (1) above */
+
+ if (PageLRU(page)) { /* first check is made without lru_lock */
+ struct zone *zone = page_zone(page);
+ struct lruvec *lruvec;
+
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+ if (PageLRU(page)) { /* re-check now that lru_lock is held */
+ int lru = page_lru(page);
+ get_page(page); /* reference kept by the isolated page */
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, lru);
+ ret = 0;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+ return ret;
+}
+
+/*
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
+ * then get rescheduled. When there are massive number of tasks doing page
+ * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
+ * the LRU list will go small and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
+ *
+ * Returns non-zero when the zone's isolated page count exceeds its inactive
+ * page count (after the gfp_mask dependent scaling below). @file selects
+ * the file counters when non-zero and the anon counters otherwise. Always
+ * returns 0 for kswapd and when global_reclaim(sc) is false.
+ */
+static int too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc)
+{
+ unsigned long inactive, isolated;
+
+ if (current_is_kswapd()) /* kswapd is never held back here */
+ return 0;
+
+ if (!global_reclaim(sc)) /* only global reclaim is limited */
+ return 0;
+
+ if (file) {
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3; /* all GFP_IOFS bits set: limit is inactive / 8 */
+
+ return isolated > inactive;
+}
+
+static noinline_for_stack void
+putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+{
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ struct zone *zone = lruvec_zone(lruvec);
+ LIST_HEAD(pages_to_free);
+
+ /*
+ * Put back any unfreeable pages.
+ *
+ * Every page on @page_list is taken off that list. Pages that are not
+ * evictable are handed to putback_lru_page(); all others are added to
+ * the LRU list selected by page_lru(). Pages whose reference count
+ * drops to zero here are removed from the LRU again and collected for
+ * freeing instead.
+ *
+ * zone->lru_lock is released and re-taken around putback_lru_page()
+ * and around the compound page destructor, so the function is entered
+ * and left with that lock held.
+ */
+ while (!list_empty(page_list)) {
+ struct page *page = lru_to_page(page_list);
+ int lru;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+
+ lruvec = mem_cgroup_page_lruvec(page, zone); /* looked up again for each page */
+
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ if (is_active_lru(lru)) { /* page went back to an active list */
+ int file = is_file_lru(lru);
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
+ }
+ if (put_page_testzero(page)) { /* count hit zero: undo the LRU add */
+ __ClearPageLRU(page);
+ __ClearPageActive(page);
+ del_page_from_lru_list(page, lruvec, lru);
+
+ if (unlikely(PageCompound(page))) {
+ spin_unlock_irq(&zone->lru_lock);
+ (*get_compound_page_dtor(page))(page);
+ spin_lock_irq(&zone->lru_lock);
+ } else
+ list_add(&page->lru, &pages_to_free);
+ }
+ }
+
+ /*
+ * To save our caller's stack, now use input list for pages to free.
+ */
+ list_splice(&pages_to_free, page_list);
+}
+
+/*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested. In other cases it is safe to throttle.
+ *
+ * Returns non-zero if the current task may be throttled: it does not have
+ * PF_LESS_THROTTLE set, or it has no backing_dev_info, or that
+ * backing_dev_info is reported write-congested by bdi_write_congested().
+ */
+static int current_may_throttle(void)
+{
+ return !(current->flags & PF_LESS_THROTTLE) ||
+ current->backing_dev_info == NULL ||
+ bdi_write_congested(current->backing_dev_info);
+}
+
/*
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc)
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
- struct pagevec pvec;
- unsigned long nr_scanned = 0;
+ unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
-
- pagevec_init(&pvec, 1);
+ unsigned long nr_taken;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
+ unsigned long nr_unqueued_dirty = 0;
+ unsigned long nr_writeback = 0;
+ unsigned long nr_immediate = 0;
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }
lru_add_drain();
+
+ if (!sc->may_unmap)
+ isolate_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ isolate_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
-
- nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
- zone->pages_scanned += nr_scan;
- spin_unlock_irq(&zone->lru_lock);
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc);
- nr_reclaimed += nr_freed;
- local_irq_disable();
- if (current_is_kswapd()) {
- __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- } else
- __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
- __count_vm_events(PGACTIVATE, nr_freed);
-
- if (nr_taken == 0)
- goto done;
-
- spin_lock(&zone->lru_lock);
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ &nr_scanned, sc, isolate_mode, lru);
+
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
+ if (global_reclaim(sc)) {
+ zone->pages_scanned += nr_scanned;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ if (nr_taken == 0)
+ return 0;
+
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ &nr_dirty, &nr_unqueued_dirty, &nr_congested,
+ &nr_writeback, &nr_immediate,
+ false);
+
+ spin_lock_irq(&zone->lru_lock);
+
+ reclaim_stat->recent_scanned[file] += nr_taken;
+
+ if (global_reclaim(sc)) {
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
+ nr_reclaimed);
+ else
+ __count_zone_vm_events(PGSTEAL_DIRECT, zone,
+ nr_reclaimed);
+ }
+
+ putback_inactive_pages(lruvec, &page_list);
+
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+
+ spin_unlock_irq(&zone->lru_lock);
+
+ free_hot_cold_page_list(&page_list, true);
+
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+ * that the long-lived page allocation rate is exceeding the page
+ * laundering rate. Either the global limits are not being effective
+ * at throttling processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing device. The
+ * only option is to throttle from reclaim context which is not ideal
+ * as there is no guarantee the dirtying process is throttled in the
+ * same way balance_dirty_pages() manages.
+ *
+ * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
+ * of pages under writeback that are flagged for immediate reclaim and
+ * stall if any are encountered in the nr_immediate check below.
+ */
+ if (nr_writeback && nr_writeback == nr_taken)
+ zone_set_flag(zone, ZONE_WRITEBACK);
+
+ /*
+ * memcg will stall in page writeback so only consider forcibly
+ * stalling for global reclaim
+ */
+ if (global_reclaim(sc)) {
/*
- * Put back any unfreeable pages.
+ * Tag a zone as congested if all the dirty pages scanned were
+ * backed by a congested BDI and wait_iff_congested will stall.
*/
- while (!list_empty(&page_list)) {
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- list_del(&page->lru);
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- } while (nr_scanned < max_scan);
- spin_unlock(&zone->lru_lock);
-done:
- local_irq_enable();
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
+ if (nr_dirty && nr_dirty == nr_congested)
+ zone_set_flag(zone, ZONE_CONGESTED);
-static inline int zone_is_near_oom(struct zone *zone)
-{
- return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+ /*
+ * If dirty pages are scanned that are not queued for IO, it
+ * implies that flushers are not keeping up. In this case, flag
+ * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+ * pages from reclaim context.
+ */
+ if (nr_unqueued_dirty == nr_taken)
+ zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+ /*
+ * If kswapd scans pages marked for immediate
+ * reclaim and under writeback (nr_immediate), it implies
+ * that pages are cycling through the LRU faster than
+ * they are written so also forcibly stall.
+ */
+ if (nr_immediate && current_may_throttle())
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }
+
+ /*
+ * Stall direct reclaim for IO completions if underlying BDIs or zone
+ * is congested. Allow kswapd to continue until it starts encountering
+ * unqueued dirty pages or cycling through the LRU too quickly.
+ */
+ if (!sc->hibernation_mode && !current_is_kswapd() &&
+ current_may_throttle())
+ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
+ trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
+ zone_idx(zone),
+ nr_scanned, nr_reclaimed,
+ sc->priority,
+ trace_shrink_flags(file));
+ return nr_reclaimed;
}
/*
@@ -739,199 +1622,708 @@ static inline int zone_is_near_oom(struct zone *zone)
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
- struct scan_control *sc)
+
+static void move_active_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list,
+ struct list_head *pages_to_free,
+ enum lru_list lru)
{
- unsigned long pgmoved;
- int pgdeactivate = 0;
- unsigned long pgscanned;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
- LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
- LIST_HEAD(l_active); /* Pages to go onto the active_list */
+ struct zone *zone = lruvec_zone(lruvec);
+ unsigned long pgmoved = 0;
struct page *page;
- struct pagevec pvec;
- int reclaim_mapped = 0;
+ int nr_pages;
- if (sc->may_swap) {
- long mapped_ratio;
- long distress;
- long swap_tendency;
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
+ lruvec = mem_cgroup_page_lruvec(page, zone);
- if (zone_is_near_oom(zone))
- goto force_reclaim_mapped;
-
- /*
- * `distress' is a measure of how much trouble we're having
- * reclaiming pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ SetPageLRU(page);
- /*
- * The point of this algorithm is to decide when to start
- * reclaiming mapped memory instead of just pagecache. Work out
- * how much memory
- * is mapped.
- */
- mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
- global_page_state(NR_ANON_PAGES)) * 100) /
- vm_total_pages;
+ nr_pages = hpage_nr_pages(page);
+ mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+ pgmoved += nr_pages;
- /*
- * Now decide how much we really want to unmap some pages. The
- * mapped ratio is downgraded - just because there's a lot of
- * mapped memory doesn't necessarily mean that page reclaim
- * isn't succeeding.
- *
- * The distress ratio is important - we don't want to start
- * going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm
- * altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+ if (put_page_testzero(page)) {
+ __ClearPageLRU(page);
+ __ClearPageActive(page);
+ del_page_from_lru_list(page, lruvec, lru);
- /*
- * Now use this metric to decide whether to start moving mapped
- * memory onto the inactive list.
- */
- if (swap_tendency >= 100)
-force_reclaim_mapped:
- reclaim_mapped = 1;
+ if (unlikely(PageCompound(page))) {
+ spin_unlock_irq(&zone->lru_lock);
+ (*get_compound_page_dtor(page))(page);
+ spin_lock_irq(&zone->lru_lock);
+ } else
+ list_add(&page->lru, pages_to_free);
+ }
}
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+ if (!is_active_lru(lru))
+ __count_vm_events(PGDEACTIVATE, pgmoved);
+}
+
+static void shrink_active_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec,
+ struct scan_control *sc,
+ enum lru_list lru)
+{
+ unsigned long nr_taken;
+ unsigned long nr_scanned;
+ unsigned long vm_flags;
+ LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_active);
+ LIST_HEAD(l_inactive);
+ struct page *page;
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ unsigned long nr_rotated = 0;
+ isolate_mode_t isolate_mode = 0;
+ int file = is_file_lru(lru);
+ struct zone *zone = lruvec_zone(lruvec);
lru_add_drain();
+
+ if (!sc->may_unmap)
+ isolate_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ isolate_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
- pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
- zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
+
+ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ &nr_scanned, sc, isolate_mode, lru);
+ if (global_reclaim(sc))
+ zone->pages_scanned += nr_scanned;
+
+ reclaim_stat->recent_scanned[file] += nr_taken;
+
+ __count_zone_vm_events(PGREFILL, zone, nr_scanned);
+ __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
- if (page_mapped(page)) {
- if (!reclaim_mapped ||
- (total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0)) {
+
+ if (unlikely(!page_evictable(page))) {
+ putback_lru_page(page);
+ continue;
+ }
+
+ if (unlikely(buffer_heads_over_limit)) {
+ if (page_has_private(page) && trylock_page(page)) {
+ if (page_has_private(page))
+ try_to_release_page(page, 0);
+ unlock_page(page);
+ }
+ }
+
+ if (page_referenced(page, 0, sc->target_mem_cgroup,
+ &vm_flags)) {
+ nr_rotated += hpage_nr_pages(page);
+ /*
+ * Identify referenced, file-backed active pages and
+ * give them one more trip around the active list. So
+ * that executable code gets better chances to stay in
+ * memory under moderate memory pressure. Anon pages
+ * are not likely to be evicted by use-once streaming
+ * IO, plus JVM can create lots of anon VM_EXEC pages,
+ * so we ignore them here.
+ */
+ if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}
+
+ ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);
}
- pagevec_init(&pvec, 1);
- pgmoved = 0;
+ /*
+ * Move pages back to the lru list.
+ */
spin_lock_irq(&zone->lru_lock);
- while (!list_empty(&l_inactive)) {
- page = lru_to_page(&l_inactive);
- prefetchw_prev_lru_page(page, &l_inactive, flags);
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- VM_BUG_ON(!PageActive(page));
- ClearPageActive(page);
+ /*
+ * Count referenced pages from currently used mappings as rotated,
+ * even though only some of them are actually re-activated. This
+ * helps balance scan pressure between file and anonymous pages in
+ * get_scan_count.
+ */
+ reclaim_stat->recent_rotated[file] += nr_rotated;
- list_move(&page->lru, &zone->inactive_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pgdeactivate += pgmoved;
- pgmoved = 0;
- if (buffer_heads_over_limit)
- pagevec_strip(&pvec);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
+ move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
+ move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+ spin_unlock_irq(&zone->lru_lock);
+
+ free_hot_cold_page_list(&l_hold, true);
+}
+
+#ifdef CONFIG_SWAP
+static int inactive_anon_is_low_global(struct zone *zone)
+{
+ unsigned long active, inactive;
+
+ active = zone_page_state(zone, NR_ACTIVE_ANON);
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+ if (inactive * zone->inactive_ratio < active) /* zone-wide anon counters */
+ return 1;
+
+ return 0;
+}
+
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @lruvec: LRU vector to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ *
+ * Always returns 0 when total_swap_pages is zero. Unless
+ * mem_cgroup_disabled() is true, the decision is delegated to
+ * mem_cgroup_inactive_anon_is_low(); otherwise the counters of the
+ * lruvec's zone are compared by inactive_anon_is_low_global().
+ */
+static int inactive_anon_is_low(struct lruvec *lruvec)
+{
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!total_swap_pages)
+ return 0;
+
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_inactive_anon_is_low(lruvec);
+
+ return inactive_anon_is_low_global(lruvec_zone(lruvec));
+}
+#else
+static inline int inactive_anon_is_low(struct lruvec *lruvec)
+{
+ return 0; /* built without CONFIG_SWAP: never reports low */
+}
+#endif
+
+/**
+ * inactive_file_is_low - check if file pages need to be deactivated
+ * @lruvec: LRU vector to check
+ *
+ * When the system is doing streaming IO, memory pressure here
+ * ensures that active file pages get deactivated, until more
+ * than half of the file pages are on the inactive list.
+ *
+ * Once we get to that situation, protect the system's working
+ * set from being evicted by disabling active file page aging.
+ *
+ * This uses a different ratio than the anonymous pages, because
+ * the page cache uses a use-once replacement algorithm.
+ *
+ * Returns non-zero when the active file list of @lruvec is larger than
+ * its inactive file list, as reported by get_lru_size().
+ */
+static int inactive_file_is_low(struct lruvec *lruvec)
+{
+ unsigned long inactive;
+ unsigned long active;
+
+ inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+
+ return active > inactive;
+}
+
+static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+{
+ if (is_file_lru(lru)) /* pick the file or anon check based on @lru */
+ return inactive_file_is_low(lruvec);
+ else
+ return inactive_anon_is_low(lruvec);
+}
+
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc)
+{
+ if (is_active_lru(lru)) {
+ if (inactive_list_is_low(lruvec, lru))
+ shrink_active_list(nr_to_scan, lruvec, sc, lru);
+ return 0;
}
- zone->nr_inactive += pgmoved;
- pgdeactivate += pgmoved;
- if (buffer_heads_over_limit) {
- spin_unlock_irq(&zone->lru_lock);
- pagevec_strip(&pvec);
- spin_lock_irq(&zone->lru_lock);
+
+ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+}
+
+enum scan_balance {
+ SCAN_EQUAL, /* scan every list relative to its size */
+ SCAN_FRACT, /* scale each list's scan count by fraction[] / denominator */
+ SCAN_ANON, /* scan anon lists only; file lists get a zero scan count */
+ SCAN_FILE, /* scan file lists only; anon lists get a zero scan count */
+};
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned. The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the pages scanned we did rotate back
+ * onto the active list instead of evict.
+ *
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
+ */
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long *nr)
+{
+ struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ u64 fraction[2];
+ u64 denominator = 0; /* gcc */
+ struct zone *zone = lruvec_zone(lruvec);
+ unsigned long anon_prio, file_prio;
+ enum scan_balance scan_balance;
+ unsigned long anon, file;
+ bool force_scan = false;
+ unsigned long ap, fp;
+ enum lru_list lru;
+ bool some_scanned;
+ int pass;
+
+ /*
+ * If the zone or memcg is small, nr[l] can be 0. This
+ * results in no scanning on this priority and a potential
+ * priority drop. Global direct reclaim can go to the next
+ * zone and tends to have no problems. Global kswapd is for
+ * zone balancing and it needs to scan a minimum amount. When
+ * reclaiming for a memcg, a priority drop can cause high
+ * latencies, so it's better to scan a minimum amount there as
+ * well.
+ */
+ if (current_is_kswapd() && !zone_reclaimable(zone))
+ force_scan = true;
+ if (!global_reclaim(sc))
+ force_scan = true;
+
+ /* If we have no swap space, do not bother scanning anon pages. */
+ if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ scan_balance = SCAN_FILE;
+ goto out;
}
- pgmoved = 0;
- while (!list_empty(&l_active)) {
- page = lru_to_page(&l_active);
- prefetchw_prev_lru_page(page, &l_active, flags);
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- VM_BUG_ON(!PageActive(page));
- list_move(&page->lru, &zone->active_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
- pgmoved = 0;
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
+ /*
+ * Global reclaim will swap to prevent OOM even with no
+ * swappiness, but memcg users want to use this knob to
+ * disable swapping for individual groups completely when
+ * using the memory controller's swap limit feature would be
+ * too expensive.
+ */
+ if (!global_reclaim(sc) && !sc->swappiness) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ /*
+ * Do not apply any pressure balancing cleverness when the
+ * system is close to OOM, scan both anon and file equally
+ * (unless the swappiness setting disagrees with swapping).
+ */
+ if (!sc->priority && sc->swappiness) {
+ scan_balance = SCAN_EQUAL;
+ goto out;
+ }
+
+ anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (global_reclaim(sc)) {
+ unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+
+ if (unlikely(file + free <= high_wmark_pages(zone))) {
+ scan_balance = SCAN_ANON;
+ goto out;
}
}
- zone->nr_active += pgmoved;
- __count_zone_vm_events(PGREFILL, zone, pgscanned);
- __count_vm_events(PGDEACTIVATE, pgdeactivate);
+ /*
+ * There is enough inactive page cache, do not reclaim
+ * anything from the anonymous working set right now.
+ */
+ if (!inactive_file_is_low(lruvec)) {
+ scan_balance = SCAN_FILE;
+ goto out;
+ }
+
+ scan_balance = SCAN_FRACT;
+
+ /*
+ * With swappiness at 100, anonymous and file have the same priority.
+ * This scanning priority is essentially the inverse of IO cost.
+ */
+ anon_prio = sc->swappiness;
+ file_prio = 200 - anon_prio;
+
+ /*
+ * OK, so we have swap space and a fair amount of page cache
+ * pages. We use the recently rotated / recently scanned
+ * ratios to determine how valuable each cache is.
+ *
+ * Because workloads change over time (and to avoid overflow)
+ * we keep these statistics as a floating average, which ends
+ * up weighing recent references more than old ones.
+ *
+ * anon in [0], file in [1]
+ */
+ spin_lock_irq(&zone->lru_lock);
+ if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
+ reclaim_stat->recent_scanned[0] /= 2;
+ reclaim_stat->recent_rotated[0] /= 2;
+ }
+
+ if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
+ reclaim_stat->recent_scanned[1] /= 2;
+ reclaim_stat->recent_rotated[1] /= 2;
+ }
+
+ /*
+ * The amount of pressure on anon vs file pages is inversely
+ * proportional to the fraction of recently scanned pages on
+ * each list that were recently referenced and in active use.
+ */
+ ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
+ ap /= reclaim_stat->recent_rotated[0] + 1;
+
+ fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
+ fp /= reclaim_stat->recent_rotated[1] + 1;
spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
+ fraction[0] = ap;
+ fraction[1] = fp;
+ denominator = ap + fp + 1;
+out:
+ some_scanned = false;
+ /* Only use force_scan on second pass. */
+ for (pass = 0; !some_scanned && pass < 2; pass++) {
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
+
+ size = get_lru_size(lruvec, lru);
+ scan = size >> sc->priority;
+
+ if (!scan && pass && force_scan)
+ scan = min(size, SWAP_CLUSTER_MAX);
+
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
+ /*
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
+ */
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file)
+ scan = 0;
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
+ }
+ nr[lru] = scan;
+ /*
+ * Skip the second pass and don't force_scan,
+ * if we found something to scan.
+ */
+ some_scanned |= !!scan;
+ }
+ }
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ *
+ * The new shrink_lruvec() asks get_scan_count() for a per-LRU scan target,
+ * then shrinks the evictable lists in batches of at most SWAP_CLUSTER_MAX
+ * pages until the inactive-anon, active-file and inactive-file targets are
+ * all used up (or the proportional-scan logic below breaks out early). The
+ * pages freed are added to sc->nr_reclaimed instead of being returned.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- unsigned long nr_active;
- unsigned long nr_inactive;
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
+ enum lru_list lru;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
+ bool scan_adjusted;
+
+ get_scan_count(lruvec, sc, nr);
- atomic_inc(&zone->reclaim_in_progress);
+ /* Record the original scan target for proportional adjustments later */
+ memcpy(targets, nr, sizeof(nr));
/*
- * Add one to `nr_to_scan' just to make sure that the kernel will
- * slowly sift through the active list.
+ * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+ * event that can occur when there is little memory pressure e.g.
+ * multiple streaming readers/writers. Hence, we do not abort scanning
+ * when the requested number of pages are reclaimed when scanning at
+ * DEF_PRIORITY on the assumption that the fact we are direct
+ * reclaiming implies that kswapd is not keeping up and it is best to
+ * do a batch of work at once. For memcg reclaim one check is made to
+ * abort proportional reclaim if either the file or anon lru has already
+ * dropped to zero at the first pass.
*/
- zone->nr_scan_active += (zone->nr_active >> priority) + 1;
- nr_active = zone->nr_scan_active;
- if (nr_active >= sc->swap_cluster_max)
- zone->nr_scan_active = 0;
- else
- nr_active = 0;
-
- zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
- nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= sc->swap_cluster_max)
- zone->nr_scan_inactive = 0;
- else
- nr_inactive = 0;
-
- while (nr_active || nr_inactive) {
- if (nr_active) {
- nr_to_scan = min(nr_active,
- (unsigned long)sc->swap_cluster_max);
- nr_active -= nr_to_scan;
- shrink_active_list(nr_to_scan, zone, sc);
+ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
+
+ blk_start_plug(&plug);
+ while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+ nr[LRU_INACTIVE_FILE]) {
+ unsigned long nr_anon, nr_file, percentage;
+ unsigned long nr_scanned;
+
+ for_each_evictable_lru(lru) {
+ if (nr[lru]) {
+ nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+ nr[lru] -= nr_to_scan;
+
+ nr_reclaimed += shrink_list(lru, nr_to_scan,
+ lruvec, sc);
+ }
}
- if (nr_inactive) {
- nr_to_scan = min(nr_inactive,
- (unsigned long)sc->swap_cluster_max);
- nr_inactive -= nr_to_scan;
- nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
- sc);
+ if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ continue;
+
+ /*
+ * For kswapd and memcg, reclaim at least the number of pages
+ * requested. Ensure that the anon and file LRUs are scanned
+ * proportionally to what was requested by get_scan_count(). We
+ * stop reclaiming one LRU and reduce the amount of scanning
+ * proportional to the original scan target.
+ */
+ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+ nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+ /*
+ * It's just vindictive to attack the larger once the smaller
+ * has gone to zero. And given the way we stop scanning the
+ * smaller below, this makes sure that we only make one nudge
+ * towards proportionality once we've got nr_to_reclaim.
+ */
+ if (!nr_file || !nr_anon)
+ break;
+
+ if (nr_file > nr_anon) {
+ unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+ targets[LRU_ACTIVE_ANON] + 1;
+ lru = LRU_BASE;
+ percentage = nr_anon * 100 / scan_target;
+ } else {
+ unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
+ targets[LRU_ACTIVE_FILE] + 1;
+ lru = LRU_FILE;
+ percentage = nr_file * 100 / scan_target;
}
+
+ /* Stop scanning the smaller of the LRU */
+ nr[lru] = 0;
+ nr[lru + LRU_ACTIVE] = 0;
+
+ /*
+ * Recalculate the other LRU scan count based on its original
+ * scan target and the percentage scanning already complete
+ */
+ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
+ nr_scanned = targets[lru] - nr[lru];
+ nr[lru] = targets[lru] * (100 - percentage) / 100;
+ nr[lru] -= min(nr[lru], nr_scanned);
+
+ lru += LRU_ACTIVE;
+ nr_scanned = targets[lru] - nr[lru];
+ nr[lru] = targets[lru] * (100 - percentage) / 100;
+ nr[lru] -= min(nr[lru], nr_scanned);
+
+ scan_adjusted = true;
}
+ blk_finish_plug(&plug);
+ sc->nr_reclaimed += nr_reclaimed;
- throttle_vm_writeout();
+ /*
+ * Even if we did not try to evict anon pages at all, we want to
+ * rebalance the anon lru active/inactive ratio.
+ */
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
- atomic_dec(&zone->reclaim_in_progress);
- return nr_reclaimed;
+ throttle_vm_writeout(sc->gfp_mask);
+}
+
+/*
+ * Decide whether reclaim should run in reclaim/compaction mode.
+ *
+ * That mode is only used for high-order requests on kernels built with
+ * CONFIG_COMPACTION, and then only when the order is above
+ * PAGE_ALLOC_COSTLY_ORDER or the scan priority has already fallen below
+ * DEF_PRIORITY - 2.
+ */
+static bool in_reclaim_compaction(struct scan_control *sc)
+{
+	if (!IS_ENABLED(CONFIG_COMPACTION))
+		return false;
+
+	/* Order-0 requests never need compaction */
+	if (!sc->order)
+		return false;
+
+	return sc->order > PAGE_ALLOC_COSTLY_ORDER ||
+	       sc->priority < DEF_PRIORITY - 2;
+}
+
+/*
+ * Reclaim/compaction is used for high-order allocation requests. It reclaims
+ * order-0 pages before compacting the zone. should_continue_reclaim() returns
+ * true if more pages should be reclaimed so that when the page allocator
+ * calls try_to_compact_zone() it will have enough free pages to succeed.
+ * It will give up earlier than that if there is difficulty reclaiming pages.
+ *
+ * @nr_reclaimed and @nr_scanned are the amounts reclaimed/scanned by the
+ * round that just finished (shrink_zone() passes in deltas), whereas
+ * sc->nr_reclaimed is the running total that is compared against the
+ * compaction target below.
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ struct scan_control *sc)
+{
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
+
+ /* If not in reclaim/compaction mode, stop */
+ if (!in_reclaim_compaction(sc))
+ return false;
+
+ /* Consider stopping depending on scan and reclaim activity */
+ if (sc->gfp_mask & __GFP_REPEAT) {
+ /*
+ * For __GFP_REPEAT allocations, stop reclaiming if the
+ * full LRU list has been scanned and we are still failing
+ * to reclaim pages. This full LRU scan is potentially
+ * expensive but a __GFP_REPEAT caller really wants to succeed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+ } else {
+ /*
+ * For non-__GFP_REPEAT allocations which can presumably
+ * fail without consequence, stop if we failed to reclaim
+ * any pages from the last SWAP_CLUSTER_MAX number of
+ * pages that were scanned. This will return to the
+ * caller faster at the risk that reclaim/compaction and
+ * the resulting allocation attempt fail
+ */
+ if (!nr_reclaimed)
+ return false;
+ }
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming.
+ * The target is 2UL << order pages, i.e. twice the size of the
+ * requested block; inactive anon only counts when swap is available.
+ */
+ pages_for_compaction = (2UL << sc->order);
+ inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+ if (sc->nr_reclaimed < pages_for_compaction &&
+ inactive_lru_pages > pages_for_compaction)
+ return true;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ switch (compaction_suitable(zone, sc->order)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ return true;
+ }
+}
+/*
+ * Reclaim from @zone by running shrink_lruvec() on the lruvec of every memcg
+ * that mem_cgroup_iter() yields under sc->target_mem_cgroup.
+ *
+ * For non-global (limit) reclaim the walk stops as soon as sc->nr_to_reclaim
+ * pages have been reclaimed. After each walk the scanned/reclaimed deltas
+ * are reported to vmpressure(), and the walk is repeated for as long as
+ * should_continue_reclaim() asks for more.
+ */
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long nr_reclaimed, nr_scanned;
+
+ do {
+ struct mem_cgroup *root = sc->target_mem_cgroup;
+ struct mem_cgroup_reclaim_cookie reclaim = {
+ .zone = zone,
+ .priority = sc->priority,
+ };
+ struct mem_cgroup *memcg;
+
+ nr_reclaimed = sc->nr_reclaimed;
+ nr_scanned = sc->nr_scanned;
+
+ memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ do {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+ sc->swappiness = mem_cgroup_swappiness(memcg);
+ shrink_lruvec(lruvec, sc);
+
+ /*
+ * Direct reclaim and kswapd have to scan all memory
+ * cgroups to fulfill the overall scan target for the
+ * zone.
+ *
+ * Limit reclaim, on the other hand, only cares about
+ * nr_to_reclaim pages to be reclaimed and it will
+ * retry with decreasing priority if one round over the
+ * whole hierarchy is not sufficient.
+ */
+ if (!global_reclaim(sc) &&
+ sc->nr_reclaimed >= sc->nr_to_reclaim) {
+ mem_cgroup_iter_break(root, memcg);
+ break;
+ }
+ memcg = mem_cgroup_iter(root, memcg, &reclaim);
+ } while (memcg);
+
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
+ } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc));
+}
+
+/*
+ * Returns true if compaction should go ahead for a high-order request.
+ *
+ * Always false for orders up to PAGE_ALLOC_COSTLY_ORDER. Otherwise the
+ * answer is whether the zone's free pages clear an order-0 watermark of
+ * high_wmark + balance_gap + (2UL << sc->order), except that a zone where
+ * compaction is neither deferred nor suitable is never reported as ready.
+ */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long balance_gap, watermark;
+ bool watermark_ok;
+
+ /* Do not consider compaction for orders reclaim is meant to satisfy */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * Compaction takes time to run and there are potentially other
+ * callers using the pages just freed. Continue reclaiming until
+ * there is a buffer of free pages available to give compaction
+ * a reasonable chance of completing and allocating the page
+ */
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+ /*
+ * If compaction is deferred, reclaim up to a point where
+ * compaction will have a chance of success when re-enabled
+ */
+ if (compaction_deferred(zone, sc->order))
+ return watermark_ok;
+
+ /*
+ * If compaction is not ready to start, keep reclaiming.
+ * NOTE(review): this relies on the "not suitable" result of
+ * compaction_suitable() being 0; kswapd_shrink_zone() compares against
+ * COMPACT_SKIPPED explicitly - confirm the two agree.
+ */
+ if (!compaction_suitable(zone, sc->order))
+ return false;
+
+ return watermark_ok;
}
/*
@@ -939,47 +2331,143 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
- * We reclaim from a zone even if that zone is over pages_high. Because:
+ * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
+ * Because:
* a) The caller may be trying to free *extra* pages to satisfy a higher-order
* allocation or
- * b) The zones may be over pages_high but they must go *over* pages_high to
- * satisfy the `incremental min' zone defense algorithm.
- *
- * Returns the number of reclaimed pages.
+ * b) The target zone may be at high_wmark_pages(zone) but the lower zones
+ * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
+ * zone defense algorithm.
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
+ *
+ * Only zones of @zonelist at or below gfp_zone(sc->gfp_mask) that pass
+ * sc->nodemask are visited. For global reclaim, slab caches are shrunk once
+ * after the zone walk and any slab pages freed are added to sc->nr_reclaimed.
*/
-static unsigned long shrink_zones(int priority, struct zone **zones,
- struct scan_control *sc)
+static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
- unsigned long nr_reclaimed = 0;
- int i;
+ struct zoneref *z;
+ struct zone *zone;
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
+ unsigned long lru_pages = 0;
+ bool aborted_reclaim = false;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ gfp_t orig_mask;
+ struct shrink_control shrink = {
+ .gfp_mask = sc->gfp_mask,
+ };
+ enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+
+ /*
+ * If the number of buffer_heads in the machine exceeds the maximum
+ * allowed level, force direct reclaim to scan the highmem zone as
+ * highmem pages could be pinning lowmem pages storing buffer_heads
+ */
+ orig_mask = sc->gfp_mask;
+ if (buffer_heads_over_limit)
+ sc->gfp_mask |= __GFP_HIGHMEM;
- sc->all_unreclaimable = 1;
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
+ nodes_clear(shrink.nodes_to_scan);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
+ /*
+ * Take care that memory controller reclaim has only a small
+ * influence on the global LRU.
+ */
+ if (global_reclaim(sc)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
- continue;
+ lru_pages += zone_reclaimable_pages(zone);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ if (sc->priority != DEF_PRIORITY &&
+ !zone_reclaimable(zone))
+ continue; /* Let kswapd poll it */
+ if (IS_ENABLED(CONFIG_COMPACTION)) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticeable problem, like transparent huge
+ * page allocations.
+ */
+ if ((zonelist_zone_idx(z) <= requested_highidx)
+ && compaction_ready(zone, sc)) {
+ aborted_reclaim = true;
+ continue;
+ }
+ }
+ /*
+ * This steals pages from memory cgroups over softlimit
+ * and returns the number of reclaimed pages and
+ * scanned pages. This works for global memory pressure
+ * and balancing, not for a memcg's limit.
+ */
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ sc->order, sc->gfp_mask,
+ &nr_soft_scanned);
+ sc->nr_reclaimed += nr_soft_reclaimed;
+ sc->nr_scanned += nr_soft_scanned;
+ /* TODO: needs a check to avoid calling shrink_zone() more than necessary */
+ }
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
- continue; /* Let kswapd poll it */
+ shrink_zone(zone, sc);
+ }
- sc->all_unreclaimable = 0;
+ /*
+ * Don't shrink slabs when reclaiming memory from over limit cgroups
+ * but do shrink slab at least once when aborting reclaim for
+ * compaction to avoid unevenly scanning file/anon LRU pages over slab
+ * pages.
+ */
+ if (global_reclaim(sc)) {
+ shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
+ }
+
+ /*
+ * Restore to original mask to avoid the impact on the caller if we
+ * promoted it to __GFP_HIGHMEM.
+ */
+ sc->gfp_mask = orig_mask;
+
+ return aborted_reclaim;
+}
- nr_reclaimed += shrink_zone(priority, zone, sc);
+/*
+ * All zones in zonelist are unreclaimable?
+ *
+ * Walks the same zones direct reclaim would consider (those at or below
+ * gfp_zone(sc->gfp_mask), filtered by sc->nodemask, populated and allowed by
+ * the cpuset hardwall check) and returns false as soon as one of them
+ * reports zone_reclaimable(); returns true if none does.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone))
+ return false;
}
- return nr_reclaimed;
+
+ return true;
}
-
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -988,55 +2476,42 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
*
* If the caller is !__GFP_FS then the probability of a failure is reasonably
* high - the zone may be full of dirty or under-writeback pages, which this
- * caller can't do much about. We kick pdflush and take explicit naps in the
- * hope that some of these pages can be written. But if the allocating task
- * holds filesystem locks which prevent writeout this might not work, and the
- * allocation attempt will fail.
+ * caller can't do much about. We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written. But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
+ *
+ * returns: 0, if no pages reclaimed
+ * else, the number of pages reclaimed
+ *
+ * Note that 1 is also returned when nothing was reclaimed but the caller
+ * should not OOM: either reclaim was aborted because compaction is ready,
+ * or (global reclaim only) some zone is still reclaimable. When
+ * oom_killer_disabled is set and nothing was reclaimed, 0 is returned
+ * without those checks.
*/
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ struct scan_control *sc)
{
- int priority;
- int ret = 0;
unsigned long total_scanned = 0;
- unsigned long nr_reclaimed = 0;
- struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
- int i;
- struct scan_control sc = {
- .gfp_mask = gfp_mask,
- .may_writepage = !laptop_mode,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
- .may_swap = 1,
- .swappiness = vm_swappiness,
- };
+ unsigned long writeback_threshold;
+ bool aborted_reclaim;
- count_vm_event(ALLOCSTALL);
+ delayacct_freepages_start();
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
+ if (global_reclaim(sc))
+ count_vm_event(ALLOCSTALL);
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
- continue;
-
- zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
- }
+ do {
+ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->priority);
+ sc->nr_scanned = 0;
+ aborted_reclaim = shrink_zones(zonelist, sc);
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc.nr_scanned = 0;
- if (!priority)
- disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zones, &sc);
- shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
- if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
- total_scanned += sc.nr_scanned;
- if (nr_reclaimed >= sc.swap_cluster_max) {
- ret = 1;
+ total_scanned += sc->nr_scanned;
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim)
goto out;
- }
+
+ /*
+ * If we're getting trouble reclaiming, start doing
+ * writepage even in laptop mode.
+ */
+ if (sc->priority < DEF_PRIORITY - 2)
+ sc->may_writepage = 1;
/*
* Try to write back as many pages as we just scanned. This
@@ -1045,36 +2520,492 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc.swap_cluster_max +
- sc.swap_cluster_max / 2) {
- wakeup_pdflush(laptop_mode ? 0 : total_scanned);
- sc.may_writepage = 1;
+ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
+ if (total_scanned > writeback_threshold) {
+ wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+ WB_REASON_TRY_TO_FREE_PAGES);
+ sc->may_writepage = 1;
}
+ } while (--sc->priority >= 0 && !aborted_reclaim);
+
+out:
+ delayacct_freepages_end();
+
+ if (sc->nr_reclaimed)
+ return sc->nr_reclaimed;
+
+ /*
+ * As hibernation is going on, kswapd is frozen so that it can't mark
+ * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
+ * check.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (aborted_reclaim)
+ return 1;
+
+ /* top priority shrink_zones still had more to do? don't OOM, then */
+ if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
+ return 1;
+
+ return 0;
+}
+/*
+ * Check whether the PFMEMALLOC reserves of @pgdat are healthy enough that
+ * direct reclaimers need not be throttled.
+ *
+ * Sums min_wmark_pages() and NR_FREE_PAGES over the populated zones up to and
+ * including ZONE_NORMAL and returns true when the free pages exceed half of
+ * that reserve, or when there is no reserve at all. When the check fails and
+ * pgdat->kswapd_wait has a waiter, classzone_idx is capped at ZONE_NORMAL and
+ * that wait queue is woken.
+ */
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ unsigned long pfmemalloc_reserve = 0;
+ unsigned long free_pages = 0;
+ int i;
+ bool wmark_ok;
- /* Take a nap, wait for some writeback to complete */
- if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ for (i = 0; i <= ZONE_NORMAL; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!populated_zone(zone))
+ continue;
+
+ pfmemalloc_reserve += min_wmark_pages(zone);
+ free_pages += zone_page_state(zone, NR_FREE_PAGES);
}
- /* top priority shrink_caches still had more to do? don't OOM, then */
- if (!sc.all_unreclaimable)
- ret = 1;
+
+ /* If there are no reserves (unexpected config) then do not throttle */
+ if (!pfmemalloc_reserve)
+ return true;
+
+ wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+ /* kswapd must be awake if processes are being throttled */
+ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+ pgdat->classzone_idx = min(pgdat->classzone_idx,
+ (enum zone_type)ZONE_NORMAL);
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+ return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ *
+ * @gfp_mask: allocation flags of the caller; selects the highest usable zone
+ * and, via __GFP_FS, whether the task may sleep until kswapd wakes it.
+ * @zonelist: zonelist the allocation is walking.
+ * @nodemask: optional nodemask restricting the zonelist walk.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
+ */
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ pg_data_t *pgdat = NULL;
+
+ /*
+ * Kernel threads should not be throttled as they may be indirectly
+ * responsible for cleaning pages necessary for reclaim to make forward
+ * progress. kjournald for example may enter direct reclaim while
+ * committing a transaction where throttling it could force other
+ * processes to block on log_wait_commit().
+ */
+ if (current->flags & PF_KTHREAD)
+ goto out;
+
+ /*
+ * If a fatal signal is pending, this process should not throttle.
+ * It should return quickly so it can exit and free its memory
+ */
+ if (fatal_signal_pending(current))
+ goto out;
+
+ /*
+ * Check if the pfmemalloc reserves are ok by finding the first node
+ * with a usable ZONE_NORMAL or lower zone. The expectation is that
+ * GFP_KERNEL will be required for allocating network buffers when
+ * swapping over the network so ZONE_HIGHMEM is unusable.
+ *
+ * Throttling is based on the first usable node and throttled processes
+ * wait on a queue until kswapd makes progress and wakes them. There
+ * is an affinity then between processes waking up and where reclaim
+ * progress has been made assuming the process wakes on the same node.
+ * More importantly, processes running on remote nodes will not compete
+ * for remote pfmemalloc reserves and processes on different nodes
+ * should make reasonable progress.
+ *
+ * The iterator takes the highest allowed zone index, not the raw gfp
+ * flags, so convert with gfp_zone() like the other zonelist walks in
+ * this file do.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(gfp_mask), nodemask) {
+ if (zone_idx(zone) > ZONE_NORMAL)
+ continue;
+
+ /* Throttle based on the first usable node */
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ goto out;
+ break;
+ }
+
+ /* If no zone was usable by the allocation flags then do not throttle */
+ if (!pgdat)
+ goto out;
+
+ /* Account for the throttling */
+ count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+ /*
+ * If the caller cannot enter the filesystem, it's possible that it
+ * is due to the caller holding an FS lock or performing a journal
+ * transaction in the case of a filesystem like ext[3|4]. In this case,
+ * it is not safe to block on pfmemalloc_wait as kswapd could be
+ * blocked waiting on the same lock. Instead, throttle for up to a
+ * second before continuing.
+ */
+ if (!(gfp_mask & __GFP_FS)) {
+ wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat), HZ);
+
+ goto check_pending;
+ }
+
+ /*
+ * Throttle until kswapd wakes the process. Use the pgdat chosen
+ * above for both the wait queue and the condition.
+ */
+ wait_event_killable(pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat));
+
+check_pending:
+ if (fatal_signal_pending(current))
+ return true;
+
out:
- for (i = 0; zones[i] != 0; i++) {
- struct zone *zone = zones[i];
+ return false;
+}
+/*
+ * Direct reclaim entry point: set up a scan_control for a global
+ * (target_mem_cgroup == NULL) reclaim of SWAP_CLUSTER_MAX pages starting at
+ * DEF_PRIORITY and run do_try_to_free_pages() on @zonelist.
+ *
+ * @gfp_mask is first filtered through memalloc_noio_flags(). Returns the
+ * value of do_try_to_free_pages(), or 1 without reclaiming anything if
+ * throttle_direct_reclaim() reports a fatal signal.
+ */
+unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+ gfp_t gfp_mask, nodemask_t *nodemask)
+{
+ unsigned long nr_reclaimed;
+ struct scan_control sc = {
+ .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .may_writepage = !laptop_mode,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .order = order,
+ .priority = DEF_PRIORITY,
+ .target_mem_cgroup = NULL,
+ .nodemask = nodemask,
+ };
+
+ /*
+ * Do not enter reclaim if fatal signal was delivered while throttled.
+ * 1 is returned so that the page allocator does not OOM kill at this
+ * point.
+ */
+ if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+ return 1;
+
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
+}
+
+#ifdef CONFIG_MEMCG
+/*
+ * Reclaim from the lruvec of a single @memcg in @zone with one
+ * shrink_lruvec() call at priority 0. The tracepoints identify this as the
+ * memcg soft limit reclaim path.
+ *
+ * @noswap disables swapping (may_swap = !noswap). Only the GFP_RECLAIM_MASK
+ * bits of @gfp_mask are kept; the rest come from GFP_HIGHUSER_MOVABLE. The
+ * number of pages scanned is stored in *@nr_scanned and the number of pages
+ * reclaimed is returned.
+ */
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
+ gfp_t gfp_mask, bool noswap,
+ struct zone *zone,
+ unsigned long *nr_scanned)
+{
+ struct scan_control sc = {
+ .nr_scanned = 0,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = !noswap,
+ .order = 0,
+ .priority = 0,
+ .swappiness = mem_cgroup_swappiness(memcg),
+ .target_mem_cgroup = memcg,
+ };
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+ sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
+ sc.may_writepage,
+ sc.gfp_mask);
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ /*
+ * NOTE: Although we can get the priority field, using it
+ * here is not a good idea, since it limits the pages we can scan.
+ * If we don't reclaim here, the shrink_zone from balance_pgdat
+ * will pick up pages from other mem cgroups as well. We hack
+ * the priority and make it zero.
+ */
+ shrink_lruvec(lruvec, &sc);
+
+ trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
+ *nr_scanned = sc.nr_scanned;
+ return sc.nr_reclaimed;
+}
+/*
+ * Reclaim on behalf of @memcg: run do_try_to_free_pages() with
+ * target_mem_cgroup set, aiming for SWAP_CLUSTER_MAX pages from DEF_PRIORITY,
+ * on the zonelist of the node picked by mem_cgroup_select_victim_node().
+ *
+ * Only the GFP_RECLAIM_MASK bits of @gfp_mask are kept; the remaining bits
+ * come from GFP_HIGHUSER_MOVABLE. @noswap disables swapping. Returns what
+ * do_try_to_free_pages() returns.
+ */
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ bool noswap)
+{
+ struct zonelist *zonelist;
+ unsigned long nr_reclaimed;
+ int nid;
+ struct scan_control sc = {
+ .may_writepage = !laptop_mode,
+ .may_unmap = 1,
+ .may_swap = !noswap,
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .order = 0,
+ .priority = DEF_PRIORITY,
+ .target_mem_cgroup = memcg,
+ .nodemask = NULL, /* we don't care about the placement */
+ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ };
+
+ /*
+ * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+ * care where the pages come from. So the node where we start the
+ * scan does not need to be the current node.
+ */
+ nid = mem_cgroup_select_victim_node(memcg);
+
+ zonelist = NODE_DATA(nid)->node_zonelists;
+
+ trace_mm_vmscan_memcg_reclaim_begin(0,
+ sc.may_writepage,
+ sc.gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
+}
+#endif
+/*
+ * Age the active anon lists of @zone. Does nothing when no swap is
+ * configured (total_swap_pages == 0). Otherwise every memcg returned by
+ * mem_cgroup_iter() is visited and, where inactive_anon_is_low() says the
+ * inactive anon list is short, shrink_active_list() is run on
+ * LRU_ACTIVE_ANON with a SWAP_CLUSTER_MAX scan budget.
+ */
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+
+ if (!total_swap_pages)
+ return;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+
+ if (inactive_anon_is_low(lruvec))
+ shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+ sc, LRU_ACTIVE_ANON);
+
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ } while (memcg);
+}
+
+/*
+ * A zone counts as balanced when its free pages clear the high watermark
+ * plus @balance_gap at the given @order and, for high-order requests on
+ * CONFIG_COMPACTION kernels, compaction_suitable() does not report zero.
+ */
+static bool zone_balanced(struct zone *zone, int order,
+			  unsigned long balance_gap, int classzone_idx)
+{
+	unsigned long mark = high_wmark_pages(zone) + balance_gap;
+
+	if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx, 0))
+		return false;
+
+	/* Order-0, or kernels without compaction, only need the watermark */
+	if (!IS_ENABLED(CONFIG_COMPACTION) || !order)
+		return true;
+
+	if (compaction_suitable(zone, order))
+		return true;
+
+	return false;
+}
+
+/*
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the caller's classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonable sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ *
+ * Returns true if the node is considered balanced for @order. Unpopulated
+ * zones are ignored and zones that are not zone_reclaimable() are counted
+ * as balanced.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ unsigned long managed_pages = 0;
+ unsigned long balanced_pages = 0;
+ int i;
+
+ /* Check the watermark levels */
+ for (i = 0; i <= classzone_idx; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
continue;
- zone->prev_priority = zone->temp_priority;
+ managed_pages += zone->managed_pages;
+
+ /*
+ * A special case here:
+ *
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well!
+ */
+ if (!zone_reclaimable(zone)) {
+ balanced_pages += zone->managed_pages;
+ continue;
+ }
+
+ if (zone_balanced(zone, order, 0, i))
+ balanced_pages += zone->managed_pages;
+ else if (!order)
+ return false;
}
- return ret;
+
+ if (order)
+ return balanced_pages >= (managed_pages >> 2);
+ else
+ return true;
+}
+
+/*
+ * Decide whether kswapd may go to sleep.
+ *
+ * kswapd must stay awake if its short nap was cut short (@remaining is
+ * non-zero), if tasks are still parked on pfmemalloc_wait, or if the node
+ * is not balanced for @order within @classzone_idx.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
+					int classzone_idx)
+{
+	bool throttled_waiters;
+
+	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
+	if (remaining != 0)
+		return false;
+
+	/*
+	 * kswapd's watermark check can race with a task getting throttled,
+	 * and a wakeup can be missed if throttled tasks exist when a large
+	 * process exits and thereby balances the zones. Nobody should be
+	 * left sleeping on pfmemalloc_wait while kswapd itself sleeps, so
+	 * release any waiters now; they will wake kswapd and be throttled
+	 * again if that is still necessary.
+	 */
+	throttled_waiters = waitqueue_active(&pgdat->pfmemalloc_wait);
+	if (!throttled_waiters)
+		return pgdat_balanced(pgdat, order, classzone_idx);
+
+	wake_up(&pgdat->pfmemalloc_wait);
+	return false;
+}
+
+/*
+ * kswapd shrinks the zone by the number of pages required to reach
+ * the high watermark.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ *
+ * True is also returned, without reclaiming anything, when the zone is
+ * already balanced and there is no lowmem pressure. *nr_attempted is only
+ * increased (by sc->nr_to_reclaim) when reclaim actually ran.
+ *
+ * NOTE(review): current->reclaim_state is dereferenced below without a NULL
+ * check - presumably it is always set for kswapd; confirm against the caller.
+ */
+static bool kswapd_shrink_zone(struct zone *zone,
+ int classzone_idx,
+ struct scan_control *sc,
+ unsigned long lru_pages,
+ unsigned long *nr_attempted)
+{
+ int testorder = sc->order;
+ unsigned long balance_gap;
+ struct reclaim_state *reclaim_state = current->reclaim_state;
+ struct shrink_control shrink = {
+ .gfp_mask = sc->gfp_mask,
+ };
+ bool lowmem_pressure;
+
+ /* Reclaim above the high watermark. */
+ sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+ /*
+ * Kswapd reclaims only single pages with compaction enabled. Trying
+ * too hard to reclaim until contiguous free pages have become
+ * available can hurt performance by evicting too much useful data
+ * from memory. Do not reclaim more than needed for compaction.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ compaction_suitable(zone, sc->order) !=
+ COMPACT_SKIPPED)
+ testorder = 0;
+
+ /*
+ * We put equal pressure on every zone, unless one zone has way too
+ * many pages free already. The "too many pages" is defined as the
+ * high wmark plus a "gap" where the gap is either the low
+ * watermark or 1% of the zone, whichever is smaller.
+ */
+ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+ zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+
+ /*
+ * If there is no low memory pressure or the zone is balanced then no
+ * reclaim is necessary
+ */
+ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+ if (!lowmem_pressure && zone_balanced(zone, testorder,
+ balance_gap, classzone_idx))
+ return true;
+
+ shrink_zone(zone, sc);
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+
+ reclaim_state->reclaimed_slab = 0;
+ shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+
+ /* Account for the number of pages attempted to reclaim */
+ *nr_attempted += sc->nr_to_reclaim;
+
+ zone_clear_flag(zone, ZONE_WRITEBACK);
+
+ /*
+ * If a zone reaches its high watermark, consider it to be no longer
+ * congested. It's possible there are dirty pages backed by congested
+ * BDIs but as pressure is relieved, speculatively avoid congestion
+ * waits.
+ */
+ if (zone_reclaimable(zone) &&
+ zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ }
+
+ return sc->nr_scanned >= sc->nr_to_reclaim;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at pages_high.
+ * they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -1085,48 +3016,37 @@ out:
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
- * zones which have free_pages > pages_high, but once a zone is found to have
- * free_pages <= pages_high, we scan that zone and the lower zones regardless
- * of the number of free pages in the lower zones. This interoperates with
- * the page allocator fallback scheme to ensure that aging of pages is balanced
- * across the zones.
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+ int *classzone_idx)
{
- int all_zones_ok;
- int priority;
int i;
- unsigned long total_scanned;
- unsigned long nr_reclaimed;
- struct reclaim_state *reclaim_state = current->reclaim_state;
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
+ unsigned long nr_soft_reclaimed;
+ unsigned long nr_soft_scanned;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_unmap = 1,
.may_swap = 1,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
- .swappiness = vm_swappiness,
+ .may_writepage = !laptop_mode,
+ .order = order,
+ .target_mem_cgroup = NULL,
};
-
-loop_again:
- total_scanned = 0;
- nr_reclaimed = 0;
- sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->temp_priority = DEF_PRIORITY;
- }
-
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
+ do {
unsigned long lru_pages = 0;
+ unsigned long nr_attempted = 0;
+ bool raise_priority = true;
+ bool pgdat_needs_compaction = (order > 0);
- /* The swap token gets in the way of swapout... */
- if (!priority)
- disable_swap_token();
-
- all_zones_ok = 1;
+ sc.nr_reclaimed = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -1138,24 +3058,71 @@ loop_again:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (sc.priority != DEF_PRIORITY &&
+ !zone_reclaimable(zone))
continue;
- if (!zone_watermark_ok(zone, order, zone->pages_high,
- 0, 0)) {
+ /*
+ * Do some background aging of the anon list, to give
+ * pages a chance to be referenced before reclaiming.
+ */
+ age_active_anon(zone, &sc);
+
+ /*
+ * If the number of buffer_heads in the machine
+ * exceeds the maximum allowed level and this node
+ * has a highmem zone, force kswapd to reclaim from
+ * it to relieve lowmem pressure.
+ */
+ if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
- goto scan;
+ break;
+ }
+
+ if (!zone_balanced(zone, order, 0, 0)) {
+ end_zone = i;
+ break;
+ } else {
+ /*
+ * If balanced, clear the dirty and congested
+ * flags
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
}
}
- goto out;
-scan:
+
+ if (i < 0)
+ goto out;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ if (!populated_zone(zone))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+
+ /*
+ * If any zone is currently balanced then kswapd will
+ * not call compaction as it is expected that the
+ * necessary pages are already available.
+ */
+ if (pgdat_needs_compaction &&
+ zone_watermark_ok(zone, order,
+ low_wmark_pages(zone),
+ *classzone_idx, 0))
+ pgdat_needs_compaction = false;
}
/*
+ * If we're getting trouble reclaiming, start doing writepage
+ * even in laptop mode.
+ */
+ if (sc.priority < DEF_PRIORITY - 2)
+ sc.may_writepage = 1;
+
+ /*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
@@ -1166,76 +3133,145 @@ scan:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- int nr_slab;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (sc.priority != DEF_PRIORITY &&
+ !zone_reclaimable(zone))
continue;
- if (!zone_watermark_ok(zone, order, zone->pages_high,
- end_zone, 0))
- all_zones_ok = 0;
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
sc.nr_scanned = 0;
- nr_reclaimed += shrink_zone(priority, zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- nr_reclaimed += reclaim_state->reclaimed_slab;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 && zone->pages_scanned >=
- (zone->nr_active + zone->nr_inactive) * 6)
- zone->all_unreclaimable = 1;
+
+ nr_soft_scanned = 0;
+ /*
+ * Call soft limit reclaim before calling shrink_zone.
+ */
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ order, sc.gfp_mask,
+ &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+
/*
- * If we've done a decent amount of scanning and
- * the reclaim ratio is low, start doing writepage
- * even in laptop mode
+ * There should be no need to raise the scanning
+ * priority if enough pages are already being scanned
+ * that the high watermark would be met at 100%
+ * efficiency.
*/
- if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > nr_reclaimed + nr_reclaimed / 2)
- sc.may_writepage = 1;
+ if (kswapd_shrink_zone(zone, end_zone, &sc,
+ lru_pages, &nr_attempted))
+ raise_priority = false;
}
- if (all_zones_ok)
- break; /* kswapd: all done */
+
/*
- * OK, kswapd is getting into trouble. Take a nap, then take
- * another pass across the zones.
+ * If the low watermark is met there is no need for processes
+ * to be throttled on pfmemalloc_wait as they should now be
+ * able to safely make forward progress. Wake them
*/
- if (total_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+ pfmemalloc_watermark_ok(pgdat))
+ wake_up(&pgdat->pfmemalloc_wait);
/*
- * We do this so kswapd doesn't build up large priorities for
- * example when it is freeing in parallel with allocators. It
- * matches the direct reclaim path behaviour in terms of impact
- * on zone->*_priority.
+ * Fragmentation may mean that the system cannot be rebalanced
+ * for high-order allocations in all zones. If twice the
+ * allocation size has been reclaimed and the zones are still
+ * not balanced then recheck the watermarks at order-0 to
+ * prevent kswapd reclaiming excessively. Assume that a
+ * process requesting a high-order allocation can direct reclaim/compact.
*/
- if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ if (order && sc.nr_reclaimed >= 2UL << order)
+ order = sc.order = 0;
+
+ /* Check if kswapd should be suspending */
+ if (try_to_freeze() || kthread_should_stop())
break;
- }
+
+ /*
+ * Compact if necessary and kswapd is reclaiming at least the
+ * high watermark number of pages as requested
+ */
+ if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
+ compact_pgdat(pgdat, order);
+
+ /*
+ * Raise priority if scanning rate is too low or there was no
+ * progress in reclaiming pages
+ */
+ if (raise_priority || !sc.nr_reclaimed)
+ sc.priority--;
+ } while (sc.priority >= 1 &&
+ !pgdat_balanced(pgdat, order, *classzone_idx));
+
out:
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
+ /*
+ * Return the order we were reclaiming at so prepare_kswapd_sleep()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
+}
- zone->prev_priority = zone->temp_priority;
- }
- if (!all_zones_ok) {
- cond_resched();
- goto loop_again;
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /* Try to sleep for a short interval */
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
- return nr_reclaimed;
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that pages were freed and compaction may succeed so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
+ if (!kthread_should_stop())
+ schedule();
+
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
}
/*
* The background pageout daemon, started as a kernel thread
- * from the init process.
+ * from the init process.
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
@@ -1248,18 +3284,22 @@ out:
*/
static int kswapd(void *p)
{
- unsigned long order;
+ unsigned long order, new_order;
+ unsigned balanced_order;
+ int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
+
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
- cpumask_t cpumask;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- cpumask = node_to_cpumask(pgdat->node_id);
- if (!cpus_empty(cpumask))
- set_cpus_allowed(tsk, cpumask);
+ lockdep_set_current_reclaim_state(GFP_KERNEL);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
/*
@@ -1275,229 +3315,159 @@ static int kswapd(void *p)
* trying to free the first piece of memory in the first place).
*/
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ set_freezable();
- order = 0;
+ order = new_order = 0;
+ balanced_order = 0;
+ classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
- unsigned long new_order;
+ bool ret;
- try_to_freeze();
+ /*
+ * If the last balance_pgdat was unsuccessful it's unlikely a
+ * new request of a similar or harder type will succeed soon
+ * so consider going to sleep on the basis we reclaimed at
+ */
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
+ }
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- new_order = pgdat->kswapd_max_order;
- pgdat->kswapd_max_order = 0;
- if (order < new_order) {
+ if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
- * allocation
+ * allocation or has tighter zone constraints
*/
order = new_order;
+ classzone_idx = new_classzone_idx;
} else {
- schedule();
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
+ classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
}
- finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, order);
+ ret = try_to_freeze();
+ if (kthread_should_stop())
+ break;
+
+ /*
+ * We can speed up thawing tasks if we don't call balance_pgdat
+ * after returning from the refrigerator
+ */
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
+ }
}
+
+ tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+ current->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+
return 0;
}
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
- if (pgdat->kswapd_max_order < order)
+ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order) {
pgdat->kswapd_max_order = order;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
- return;
+ pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+ }
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- wake_up_interruptible(&pgdat->kswapd_wait);
-}
-
-#ifdef CONFIG_PM
-/*
- * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority, and returns the
- * number of reclaimed pages
- *
- * For pass > 3 we also try to shrink the LRU lists that contain a few pages
- */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
- int prio, struct scan_control *sc)
-{
- struct zone *zone;
- unsigned long nr_to_scan, ret = 0;
-
- for_each_zone(zone) {
-
- if (!populated_zone(zone))
- continue;
-
- if (zone->all_unreclaimable && prio != DEF_PRIORITY)
- continue;
-
- /* For pass = 0 we don't shrink the active list */
- if (pass > 0) {
- zone->nr_scan_active += (zone->nr_active >> prio) + 1;
- if (zone->nr_scan_active >= nr_pages || pass > 3) {
- zone->nr_scan_active = 0;
- nr_to_scan = min(nr_pages, zone->nr_active);
- shrink_active_list(nr_to_scan, zone, sc);
- }
- }
-
- zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
- if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
- zone->nr_scan_inactive = 0;
- nr_to_scan = min(nr_pages, zone->nr_inactive);
- ret += shrink_inactive_list(nr_to_scan, zone, sc);
- if (ret >= nr_pages)
- return ret;
- }
- }
+ if (zone_balanced(zone, order, 0, 0))
+ return;
- return ret;
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ wake_up_interruptible(&pgdat->kswapd_wait);
}
+#ifdef CONFIG_HIBERNATION
/*
- * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
* freed pages.
*
* Rather than trying to age LRUs the aim is to preserve the overall
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
*/
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- unsigned long lru_pages, nr_slab;
- unsigned long ret = 0;
- int pass;
struct reclaim_state reclaim_state;
- struct zone *zone;
struct scan_control sc = {
- .gfp_mask = GFP_KERNEL,
- .may_swap = 0,
- .swap_cluster_max = nr_pages,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE,
+ .may_swap = 1,
+ .may_unmap = 1,
.may_writepage = 1,
- .swappiness = vm_swappiness,
+ .nr_to_reclaim = nr_to_reclaim,
+ .hibernation_mode = 1,
+ .order = 0,
+ .priority = DEF_PRIORITY,
};
+ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+ struct task_struct *p = current;
+ unsigned long nr_reclaimed;
- current->reclaim_state = &reclaim_state;
-
- lru_pages = 0;
- for_each_zone(zone)
- lru_pages += zone->nr_active + zone->nr_inactive;
-
- nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
- /* If slab caches are huge, it's better to hit them first */
- while (nr_slab >= lru_pages) {
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
- if (!reclaim_state.reclaimed_slab)
- break;
-
- ret += reclaim_state.reclaimed_slab;
- if (ret >= nr_pages)
- goto out;
-
- nr_slab -= reclaim_state.reclaimed_slab;
- }
-
- /*
- * We try to shrink LRUs in 5 passes:
- * 0 = Reclaim from inactive_list only
- * 1 = Reclaim from active list but don't reclaim mapped
- * 2 = 2nd pass of type 1
- * 3 = Reclaim mapped (normal reclaim)
- * 4 = 2nd pass of type 3
- */
- for (pass = 0; pass < 5; pass++) {
- int prio;
-
- /* Needed for shrinking slab caches later on */
- if (!lru_pages)
- for_each_zone(zone) {
- lru_pages += zone->nr_active;
- lru_pages += zone->nr_inactive;
- }
-
- /* Force reclaiming mapped pages in the passes #3 and #4 */
- if (pass > 2) {
- sc.may_swap = 1;
- sc.swappiness = 100;
- }
-
- for (prio = DEF_PRIORITY; prio >= 0; prio--) {
- unsigned long nr_to_scan = nr_pages - ret;
-
- sc.nr_scanned = 0;
- ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
- if (ret >= nr_pages)
- goto out;
-
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
- ret += reclaim_state.reclaimed_slab;
- if (ret >= nr_pages)
- goto out;
-
- if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ / 10);
- }
-
- lru_pages = 0;
- }
+ p->flags |= PF_MEMALLOC;
+ lockdep_set_current_reclaim_state(sc.gfp_mask);
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
- /*
- * If ret = 0, we could not shrink LRUs, but there may be something
- * in slab caches
- */
- if (!ret)
- do {
- reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
- ret += reclaim_state.reclaimed_slab;
- } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
-out:
- current->reclaim_state = NULL;
+ p->reclaim_state = NULL;
+ lockdep_clear_current_reclaim_state();
+ p->flags &= ~PF_MEMALLOC;
- return ret;
+ return nr_reclaimed;
}
-#endif
+#endif /* CONFIG_HIBERNATION */
-#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
- pg_data_t *pgdat;
- cpumask_t mask;
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
- if (action == CPU_ONLINE) {
- for_each_online_pgdat(pgdat) {
- mask = node_to_cpumask(pgdat->node_id);
- if (any_online_cpu(mask) != NR_CPUS)
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed(pgdat->kswapd, mask);
+ set_cpus_allowed_ptr(pgdat->kswapd, mask);
}
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
/*
* This kswapd start function will be called by init and node-hot-add.
@@ -1515,18 +3485,33 @@ int kswapd_run(int nid)
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
- printk("Failed to start kswapd on node %d\n",nid);
- ret = -1;
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kswapd);
+ pgdat->kswapd = NULL;
}
return ret;
}
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kswapd_stop(int nid)
+{
+ struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+
+ if (kswapd) {
+ kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
+}
+
static int __init kswapd_init(void)
{
int nid;
swap_setup();
- for_each_online_node(nid)
+ for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
@@ -1544,7 +3529,7 @@ module_init(kswapd_init)
int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
@@ -1567,6 +3552,48 @@ int sysctl_min_unmapped_ratio = 1;
*/
int sysctl_min_slab_ratio = 5;
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+ unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+ unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_FILE);
+
+ /*
+ * It's possible for there to be more file mapped pages than
+ * accounted for by the pages on the file LRU lists because
+ * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+ */
+ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+ long nr_pagecache_reclaimable;
+ long delta = 0;
+
+ /*
+ * If RECLAIM_SWAP is set, then all file pages are considered
+ * potentially reclaimable. Otherwise, we have to worry about
+ * pages like swapcache and zone_unmapped_file_pages() provides
+ * a better estimate
+ */
+ if (zone_reclaim_mode & RECLAIM_SWAP)
+ nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+ else
+ nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+ /* If we can't clean pages, remove dirty pages from consideration */
+ if (!(zone_reclaim_mode & RECLAIM_WRITE))
+ delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+ /* Watch for any possible underflows due to delta */
+ if (unlikely(delta > nr_pagecache_reclaimable))
+ delta = nr_pagecache_reclaimable;
+
+ return nr_pagecache_reclaimable - delta;
+}
+
/*
* Try to free up some pages from this zone through reclaim.
*/
@@ -1576,19 +3603,20 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- int priority;
- unsigned long nr_reclaimed = 0;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
- .swap_cluster_max = max_t(unsigned long, nr_pages,
- SWAP_CLUSTER_MAX),
- .gfp_mask = gfp_mask,
- .swappiness = vm_swappiness,
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_swap = 1,
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .order = order,
+ .priority = ZONE_RECLAIM_PRIORITY,
};
- unsigned long slab_reclaimable;
+ struct shrink_control shrink = {
+ .gfp_mask = sc.gfp_mask,
+ };
+ unsigned long nr_slab_pages0, nr_slab_pages1;
- disable_swap_token();
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1596,57 +3624,64 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* and RECLAIM_SWAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_page_state(zone, NR_FILE_PAGES) -
- zone_page_state(zone, NR_FILE_MAPPED) >
- zone->min_unmapped_pages) {
+ if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
- priority = ZONE_RECLAIM_PRIORITY;
do {
- nr_reclaimed += shrink_zone(priority, zone, &sc);
- priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ shrink_zone(zone, &sc);
+ } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
- if (slab_reclaimable > zone->min_slab_pages) {
+ nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages0 > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
* number of slab pages and shake the slab until it is reduced
* by the same nr_pages that we used for reclaiming unmapped
* pages.
- *
- * Note that shrink_slab will free memory on all zones and may
- * take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
- slab_reclaimable - nr_pages)
- ;
+ nodes_clear(shrink.nodes_to_scan);
+ node_set(zone_to_nid(zone), shrink.nodes_to_scan);
+ for (;;) {
+ unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+ /* No reclaimable slab or very low memory pressure */
+ if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
+ break;
+
+ /* Freed enough memory */
+ nr_slab_pages1 = zone_page_state(zone,
+ NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+ break;
+ }
/*
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- nr_reclaimed += slab_reclaimable -
- zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (nr_slab_pages1 < nr_slab_pages0)
+ sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
}
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- return nr_reclaimed >= nr_pages;
+ lockdep_clear_current_reclaim_state();
+ return sc.nr_reclaimed >= nr_pages;
}
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- cpumask_t mask;
int node_id;
+ int ret;
/*
* Zone reclaim reclaims unmapped file backed pages and
@@ -1658,22 +3693,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.
*/
- if (zone_page_state(zone, NR_FILE_PAGES) -
- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
- && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
- <= zone->min_slab_pages)
- return 0;
+ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+ return ZONE_RECLAIM_FULL;
+
+ if (!zone_reclaimable(zone))
+ return ZONE_RECLAIM_FULL;
/*
- * Avoid concurrent zone reclaims, do not reclaim in a zone that does
- * not have reclaimable pages and if we should not delay the allocation
- * then do not scan.
+ * Do not scan if the allocation should not be delayed.
*/
- if (!(gfp_mask & __GFP_WAIT) ||
- zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0 ||
- (current->flags & PF_MEMALLOC))
- return 0;
+ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+ return ZONE_RECLAIM_NOSCAN;
/*
* Only run zone reclaim on the local zone or on zones that do not
@@ -1682,9 +3713,152 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* as wide as possible.
*/
node_id = zone_to_nid(zone);
- mask = node_to_cpumask(node_id);
- if (!cpus_empty(mask) && node_id != numa_node_id())
- return 0;
- return __zone_reclaim(zone, gfp_mask, order);
+ if (node_state(node_id, N_CPU) && node_id != numa_node_id())
+ return ZONE_RECLAIM_NOSCAN;
+
+ if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+ return ZONE_RECLAIM_NOSCAN;
+
+ ret = __zone_reclaim(zone, gfp_mask, order);
+ zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+ if (!ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
+ return ret;
+}
+#endif
+
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ */
+int page_evictable(struct page *page)
+{
+ return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+}
+
+#ifdef CONFIG_SHMEM
+/**
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages: array of pages to check
+ * @nr_pages: number of pages to check
+ *
+ * Checks pages for evictability and moves them to the appropriate lru list.
+ *
+ * This function is only used for SysV IPC SHM_UNLOCK.
+ */
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
+{
+ struct lruvec *lruvec;
+ struct zone *zone = NULL;
+ int pgscanned = 0;
+ int pgrescued = 0;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pages[i];
+ struct zone *pagezone;
+
+ pgscanned++;
+ pagezone = page_zone(page);
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+ lruvec = mem_cgroup_page_lruvec(page, zone);
+
+ if (!PageLRU(page) || !PageUnevictable(page))
+ continue;
+
+ if (page_evictable(page)) {
+ enum lru_list lru = page_lru_base_type(page);
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ ClearPageUnevictable(page);
+ del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, lru);
+ pgrescued++;
+ }
+ }
+
+ if (zone) {
+ __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+ spin_unlock_irq(&zone->lru_lock);
+ }
+}
+#endif /* CONFIG_SHMEM */
+
+static void warn_scan_unevictable_pages(void)
+{
+ printk_once(KERN_WARNING
+ "%s: The scan_unevictable_pages sysctl/node-interface has been "
+ "disabled for lack of a legitimate use case. If you have "
+ "one, please send an email to linux-mm@kvack.org.\n",
+ current->comm);
+}
+
+/*
+ * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
+ * all nodes' unevictable lists for evictable pages
+ */
+unsigned long scan_unevictable_pages;
+
+int scan_unevictable_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ warn_scan_unevictable_pages();
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ scan_unevictable_pages = 0;
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * per node 'scan_unevictable_pages' attribute. On demand re-scan of
+ * a specified node's per zone unevictable lists for evictable pages.
+ */
+
+static ssize_t read_scan_unevictable_node(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ warn_scan_unevictable_pages();
+ return sprintf(buf, "0\n"); /* always zero; should fit... */
+}
+
+static ssize_t write_scan_unevictable_node(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ warn_scan_unevictable_pages();
+ return 1;
+}
+
+
+static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+ read_scan_unevictable_node,
+ write_scan_unevictable_node);
+
+int scan_unevictable_register_node(struct node *node)
+{
+ return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
+}
+
+void scan_unevictable_unregister_node(struct node *node)
+{
+ device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
}
#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a2b6a9f96e5..b37bd49bfd5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -8,66 +8,35 @@
* Copyright (C) 2006 Silicon Graphics, Inc.,
* Christoph Lameter <christoph@lameter.com>
*/
-
-#include <linux/config.h>
+#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/vmstat.h>
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
+#include <linux/mm_inline.h>
-void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat)
-{
- struct zone *zones = pgdat->node_zones;
- int i;
-
- *active = 0;
- *inactive = 0;
- *free = 0;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- *active += zones[i].nr_active;
- *inactive += zones[i].nr_inactive;
- *free += zones[i].free_pages;
- }
-}
-
-void get_zone_counts(unsigned long *active,
- unsigned long *inactive, unsigned long *free)
-{
- struct pglist_data *pgdat;
-
- *active = 0;
- *inactive = 0;
- *free = 0;
- for_each_online_pgdat(pgdat) {
- unsigned long l, m, n;
- __get_zone_counts(&l, &m, &n, pgdat);
- *active += l;
- *inactive += m;
- *free += n;
- }
-}
+#include "internal.h"
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
-static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+static void sum_vm_events(unsigned long *ret)
{
- int cpu = 0;
+ int cpu;
int i;
memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
- cpu = first_cpu(*cpumask);
- while (cpu < NR_CPUS) {
+ for_each_online_cpu(cpu) {
struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
- cpu = next_cpu(cpu, *cpumask);
-
- if (cpu < NR_CPUS)
- prefetch(&per_cpu(vm_event_states, cpu));
-
-
for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
ret[i] += this->event[i];
}
@@ -80,11 +49,12 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
*/
void all_vm_events(unsigned long *ret)
{
- sum_vm_events(ret, &cpu_online_map);
+ get_online_cpus();
+ sum_vm_events(ret);
+ put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);
-#ifdef CONFIG_HOTPLUG
/*
* Fold the foreign cpu events into our own.
*
@@ -101,7 +71,6 @@ void vm_events_fold_cpu(int cpu)
fold_state->event[i] = 0;
}
}
-#endif /* CONFIG_HOTPLUG */
#endif /* CONFIG_VM_EVENT_COUNTERS */
@@ -110,12 +79,36 @@ void vm_events_fold_cpu(int cpu)
*
* vm_stat contains the global counters
*/
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark.
+ *
+ * The gap between the low and min watermarks is divided between the
+ * online CPUs, with a floor of 1.
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125, so the returned value is always
+ * within [1, 125].
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -150,7 +143,7 @@ static int calculate_threshold(struct zone *zone)
* 125 1024 10 16-32 GB 9
*/
- mem = zone->present_pages >> (27 - PAGE_SHIFT);
+ mem = zone->managed_pages >> (27 - PAGE_SHIFT);
threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
@@ -165,59 +158,80 @@ static int calculate_threshold(struct zone *zone)
/*
* Refresh the thresholds for each zone.
*/
-static void refresh_zone_stat_thresholds(void)
+void refresh_zone_stat_thresholds(void)
{
struct zone *zone;
int cpu;
int threshold;
- for_each_zone(zone) {
-
- if (!zone->present_pages)
- continue;
+ for_each_populated_zone(zone) {
+ unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
- zone_pcp(zone, cpu)->stat_threshold = threshold;
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
+ }
+}
+
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
}
}
/*
- * For use when we know that interrupts are disabled.
+ * For use when we know that interrupts are disabled,
+ * or when we know that preemption is disabled and that
+ * particular counter cannot be updated from interrupt context.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
- x = delta + *p;
+ t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -240,18 +254,19 @@ EXPORT_SYMBOL(mod_zone_page_state);
* in between and therefore the atomicity vs. interrupt cannot be exploited
* in a useful way here.
*/
-static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
+void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)++;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
@@ -261,23 +276,117 @@ void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__inc_zone_page_state);
-void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)--;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ __dec_zone_state(page_zone(page), item);
+}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * @zone: zone whose per cpu vm_stat_diff[] entry is updated
+ * @item: counter to modify
+ * @delta: signed amount added to the per cpu differential
+ * @overstep_mode: how the differential is left once the threshold is crossed
+ *
+ * Overstep mode specifies how overstep should be handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+ *
+ * When the new differential falls outside [-threshold, threshold] it is
+ * folded, together with the overstep, into the zone counters through
+ * zone_page_state_add() and the per cpu value is left at minus the overstep.
+*/
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a zone.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o); /* repeat until the cmpxchg sees the value read as o */
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -308,52 +417,132 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
+
+static inline void fold_diff(int *diff)
+{
+ int i;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (diff[i])
+ atomic_long_add(diff[i], &vm_stat[i]);
+}
/*
- * Update the zone counters for one cpu.
+ * Update the zone counters for the current cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
*/
-void refresh_cpu_vm_stats(int cpu)
+static void refresh_cpu_vm_stats(void)
{
struct zone *zone;
int i;
- unsigned long flags;
+ int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
- for_each_zone(zone) {
- struct per_cpu_pageset *pcp;
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset __percpu *p = zone->pageset;
- if (!populated_zone(zone))
- continue;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ int v;
- pcp = zone_pcp(zone, cpu);
+ v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+ if (v) {
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (pcp->vm_stat_diff[i]) {
- local_irq_save(flags);
- zone_page_state_add(pcp->vm_stat_diff[i],
- zone, i);
- pcp->vm_stat_diff[i] = 0;
- local_irq_restore(flags);
+ atomic_long_add(v, &zone->vm_stat[i]);
+ global_diff[i] += v;
+#ifdef CONFIG_NUMA
+ /* 3 seconds idle till flush */
+ __this_cpu_write(p->expire, 3);
+#endif
}
+ }
+ cond_resched();
+#ifdef CONFIG_NUMA
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
+ !__this_cpu_read(p->pcp.count))
+ continue;
+
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
+
+
+ if (__this_cpu_dec_return(p->expire))
+ continue;
+
+ if (__this_cpu_read(p->pcp.count))
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+#endif
}
+ fold_diff(global_diff);
}
-static void __refresh_cpu_vm_stats(void *dummy)
+/*
+ * Fold the data for an offline cpu into the global array.
+ * There cannot be any access by the offline cpu and therefore
+ * synchronization is simplified.
+ */
+void cpu_vm_stats_fold(int cpu)
{
- refresh_cpu_vm_stats(smp_processor_id());
+ struct zone *zone;
+ int i;
+ int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p;
+
+ p = per_cpu_ptr(zone->pageset, cpu);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (p->vm_stat_diff[i]) {
+ int v;
+
+ v = p->vm_stat_diff[i];
+ p->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ global_diff[i] += v;
+ }
+ }
+
+ fold_diff(global_diff);
}
/*
- * Consolidate all counters.
- *
- * Note that the result is less inaccurate but still inaccurate
- * if concurrent processes are allowed to run.
+ * this is only called if !populated_zone(zone), which implies no other users of
+ * pset->vm_stat_diff[] exist.
*/
-void refresh_vm_stats(void)
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
- on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
-}
-EXPORT_SYMBOL(refresh_vm_stats);
+ int i;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (pset->vm_stat_diff[i]) {
+ int v = pset->vm_stat_diff[i];
+ pset->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ atomic_long_add(v, &vm_stat[i]);
+ }
+}
#endif
#ifdef CONFIG_NUMA
@@ -362,26 +551,124 @@ EXPORT_SYMBOL(refresh_vm_stats);
* z = the zone from which the allocation occurred.
*
* Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
*/
-void zone_statistics(struct zonelist *zonelist, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
{
- if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
+ if (z->zone_pgdat == preferred_zone->zone_pgdat) {
__inc_zone_state(z, NUMA_HIT);
} else {
__inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
+ __inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
- if (z->node == numa_node_id())
+ if (z->node == ((flags & __GFP_OTHER_NODE) ?
+ preferred_zone->node : numa_node_id()))
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
}
#endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_COMPACTION
+
+struct contig_page_info {
+ unsigned long free_pages;
+ unsigned long free_blocks_total;
+ unsigned long free_blocks_suitable;
+};
+/*
+ * Calculate the number of free pages in a zone, how many contiguous
+ * pages are free and how many are large enough to satisfy an allocation of
+ * the target size. Note that this function makes no attempt to estimate
+ * how many suitable free blocks there *might* be if MOVABLE pages were
+ * migrated. Calculating that is possible, but expensive and can be
+ * figured out from userspace
+ *
+ * @zone: zone whose free_area[].nr_free counts are read
+ * @suitable_order: order of the allocation being considered
+ * @info: output; all three fields are reset to 0 and then accumulated
+ *
+ * free_blocks_suitable is expressed in blocks of suitable_order: a free
+ * block of a larger order counts as 1 << (order - suitable_order) blocks.
+ */
+static void fill_contig_page_info(struct zone *zone,
+ unsigned int suitable_order,
+ struct contig_page_info *info)
+{
+ unsigned int order;
+
+ info->free_pages = 0;
+ info->free_blocks_total = 0;
+ info->free_blocks_suitable = 0;
+
+ for (order = 0; order < MAX_ORDER; order++) {
+ unsigned long blocks;
+
+ /* Count number of free blocks */
+ blocks = zone->free_area[order].nr_free;
+ info->free_blocks_total += blocks;
+
+ /* Count free base pages */
+ info->free_pages += blocks << order;
+
+ /* Count the suitable free blocks */
+ if (order >= suitable_order)
+ info->free_blocks_suitable += blocks <<
+ (order - suitable_order);
+ }
+}
+
+/*
+ * A fragmentation index only makes sense if an allocation of a requested
+ * size would fail. If that is true, the fragmentation index indicates
+ * whether external fragmentation or a lack of memory was the problem.
+ * The value can be used to determine if page reclaim or compaction
+ * should be used
+ *
+ * @order: order of the request; the request size is 1UL << order pages
+ * @info: free block summary to evaluate
+ *
+ * Returns 0 when @info records no free blocks at all, -1000 when it records
+ * at least one suitable free block (the request would not fail), and
+ * otherwise the index scaled by 1000.
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+ unsigned long requested = 1UL << order;
+
+ if (!info->free_blocks_total)
+ return 0;
+
+ /* Fragmentation index only makes sense when a request would fail */
+ if (info->free_blocks_suitable)
+ return -1000;
+
+ /*
+ * Index is between 0 and 1 so return within 3 decimal places
+ *
+ * 0 => allocation would fail due to lack of memory
+ * 1 => allocation would fail due to fragmentation
+ */
+ return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
+}
+
+/*
+ * Same as __fragmentation_index() but fills a contig_page_info on the stack
+ * from @zone via fill_contig_page_info() before computing the index.
+ */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ return __fragmentation_index(order, &info);
+}
+#endif
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
+#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+static char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Reclaimable",
+ "Movable",
+ "Reserve",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
static void *frag_start(struct seq_file *m, loff_t *pos)
{
pg_data_t *pgdat;
@@ -406,37 +693,31 @@ static void frag_stop(struct seq_file *m, void *arg)
{
}
-/*
- * This walks the free areas for each zone.
- */
-static int frag_show(struct seq_file *m, void *arg)
+/* Walk all the zones in a node and print using a callback */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
- pg_data_t *pgdat = (pg_data_t *)arg;
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- int order;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ print(m, pgdat, zone);
spin_unlock_irqrestore(&zone->lock, flags);
- seq_putc(m, '\n');
}
- return 0;
}
+#endif
-struct seq_operations fragmentation_op = {
- .start = frag_start,
- .next = frag_next,
- .stop = frag_stop,
- .show = frag_show,
-};
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
+#ifdef CONFIG_ZONE_DMA
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
@@ -450,22 +731,38 @@ struct seq_operations fragmentation_op = {
#define TEXT_FOR_HIGHMEM(xx)
#endif
-#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
- TEXT_FOR_HIGHMEM(xx)
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+ TEXT_FOR_HIGHMEM(xx) xx "_movable",
-static char *vmstat_text[] = {
+const char * const vmstat_text[] = {
/* Zoned VM counters */
+ "nr_free_pages",
+ "nr_alloc_batch",
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+ "nr_unevictable",
+ "nr_mlock",
"nr_anon_pages",
"nr_mapped",
"nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
"nr_page_table_pages",
- "nr_dirty",
- "nr_writeback",
+ "nr_kernel_stack",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
+ "nr_vmscan_immediate_reclaim",
+ "nr_writeback_temp",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_shmem",
+ "nr_dirtied",
+ "nr_written",
#ifdef CONFIG_NUMA
"numa_hit",
@@ -475,6 +772,13 @@ static char *vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "workingset_refault",
+ "workingset_activate",
+ "workingset_nodereclaim",
+ "nr_anon_transparent_hugepages",
+ "nr_free_cma",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
"pgpgin",
@@ -492,115 +796,333 @@ static char *vmstat_text[] = {
"pgmajfault",
TEXTS_FOR_ZONES("pgrefill")
- TEXTS_FOR_ZONES("pgsteal")
+ TEXTS_FOR_ZONES("pgsteal_kswapd")
+ TEXTS_FOR_ZONES("pgsteal_direct")
TEXTS_FOR_ZONES("pgscan_kswapd")
TEXTS_FOR_ZONES("pgscan_direct")
+ "pgscan_direct_throttle",
+#ifdef CONFIG_NUMA
+ "zone_reclaim_failed",
+#endif
"pginodesteal",
"slabs_scanned",
- "kswapd_steal",
"kswapd_inodesteal",
+ "kswapd_low_wmark_hit_quickly",
+ "kswapd_high_wmark_hit_quickly",
"pageoutrun",
"allocstall",
"pgrotated",
+
+ "drop_pagecache",
+ "drop_slab",
+
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_huge_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+#endif
+#ifdef CONFIG_COMPACTION
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
+ "compact_stall",
+ "compact_fail",
+ "compact_success",
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+ "htlb_buddy_alloc_success",
+ "htlb_buddy_alloc_fail",
+#endif
+ "unevictable_pgs_culled",
+ "unevictable_pgs_scanned",
+ "unevictable_pgs_rescued",
+ "unevictable_pgs_mlocked",
+ "unevictable_pgs_munlocked",
+ "unevictable_pgs_cleared",
+ "unevictable_pgs_stranded",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_split",
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
+#endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
+#ifdef CONFIG_SMP
+ "nr_tlb_remote_flush",
+ "nr_tlb_remote_flush_received",
+#endif /* CONFIG_SMP */
+ "nr_tlb_local_flush_all",
+ "nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
+
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+ "vmacache_find_calls",
+ "vmacache_find_hits",
#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
+
+
+#ifdef CONFIG_PROC_FS
+static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
+ struct zone *zone)
+{
+ int order;
+
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (order = 0; order < MAX_ORDER; ++order)
+ seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_putc(m, '\n');
+}
/*
- * Output information about zones in @pgdat.
+ * This walks the free areas for each zone.
*/
-static int zoneinfo_show(struct seq_file *m, void *arg)
+static int frag_show(struct seq_file *m, void *arg)
{
- pg_data_t *pgdat = arg;
- struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
- unsigned long flags;
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, frag_show_print);
+ return 0;
+}
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
- int i;
+static void pagetypeinfo_showfree_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ int order, mtype;
+
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
+ seq_printf(m, "Node %4d, zone %8s, type %12s ",
+ pgdat->node_id,
+ zone->name,
+ migratetype_names[mtype]);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ unsigned long freecount = 0;
+ struct free_area *area;
+ struct list_head *curr;
+
+ area = &(zone->free_area[order]);
+
+ list_for_each(curr, &area->free_list[mtype])
+ freecount++;
+ seq_printf(m, "%6lu ", freecount);
+ }
+ seq_putc(m, '\n');
+ }
+}
- if (!populated_zone(zone))
+/*
+ * Print out the free pages at each order for each migratetype: a header row
+ * listing the orders, then the per-zone rows produced by
+ * pagetypeinfo_showfree_print(). Always returns 0.
+ */
+static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+{
+ int order;
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* Print header */
+ seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
+ for (order = 0; order < MAX_ORDER; ++order)
+ seq_printf(m, "%6d ", order);
+ seq_putc(m, '\n');
+
+ walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+
+ return 0;
+}
+
+static void pagetypeinfo_showblockcount_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ int mtype;
+ unsigned long pfn;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long count[MIGRATE_TYPES] = { 0, };
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ if (!pfn_valid(pfn))
continue;
- spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
- seq_printf(m,
- "\n pages free %lu"
- "\n min %lu"
- "\n low %lu"
- "\n high %lu"
- "\n active %lu"
- "\n inactive %lu"
- "\n scanned %lu (a: %lu i: %lu)"
- "\n spanned %lu"
- "\n present %lu",
- zone->free_pages,
- zone->pages_min,
- zone->pages_low,
- zone->pages_high,
- zone->nr_active,
- zone->nr_inactive,
- zone->pages_scanned,
- zone->nr_scan_active, zone->nr_scan_inactive,
- zone->spanned_pages,
- zone->present_pages);
+ page = pfn_to_page(pfn);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
+ /* Watch for unexpected holes punched in the memmap */
+ if (!memmap_valid_within(pfn, page, zone))
+ continue;
+ mtype = get_pageblock_migratetype(page);
+
+ if (mtype < MIGRATE_TYPES)
+ count[mtype]++;
+ }
+
+ /* Print counts */
+ seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12lu ", count[mtype]);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Print out the number of pageblocks of each migratetype: a header row
+ * naming the migratetypes, then the per-zone rows produced by
+ * pagetypeinfo_showblockcount_print(). Always returns 0.
+ */
+static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+{
+ int mtype;
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ seq_printf(m, "\n%-23s", "Number of blocks type ");
+ for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+ seq_printf(m, "%12s ", migratetype_names[mtype]);
+ seq_putc(m, '\n');
+ walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+
+ return 0;
+}
+
+/*
+ * This prints out statistics in relation to grouping pages by mobility.
+ * It is expensive to collect so do not constantly read the file.
+ *
+ * Nothing is printed for a node that is not in the N_MEMORY state.
+ * Otherwise the page block order and pages per block are printed first,
+ * followed by the free counts and then the block counts per migratetype.
+ * Always returns 0.
+ */
+static int pagetypeinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_MEMORY))
+ return 0;
+
+ seq_printf(m, "Page block order: %d\n", pageblock_order);
+ seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
+ seq_putc(m, '\n');
+ pagetypeinfo_showfree(m, pgdat);
+ pagetypeinfo_showblockcount(m, pgdat);
+
+ return 0;
+}
+
+static const struct seq_operations fragmentation_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = frag_show,
+};
+
+static int fragmentation_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fragmentation_op);
+}
+
+static const struct file_operations fragmentation_file_operations = {
+ .open = fragmentation_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct seq_operations pagetypeinfo_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = pagetypeinfo_show,
+};
+
+static int pagetypeinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &pagetypeinfo_op);
+}
+
+static const struct file_operations pagetypeinfo_file_ops = {
+ .open = pagetypeinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ struct zone *zone)
+{
+ int i;
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n scanned %lu"
+ "\n spanned %lu"
+ "\n present %lu"
+ "\n managed %lu",
+ zone_page_state(zone, NR_FREE_PAGES),
+ min_wmark_pages(zone),
+ low_wmark_pages(zone),
+ high_wmark_pages(zone),
+ zone->pages_scanned,
+ zone->spanned_pages,
+ zone->present_pages,
+ zone->managed_pages);
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ for_each_online_cpu(i) {
+ struct per_cpu_pageset *pageset;
+
+ pageset = per_cpu_ptr(zone->pageset, i);
seq_printf(m,
- "\n protection: (%lu",
- zone->lowmem_reserve[0]);
- for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
- for_each_online_cpu(i) {
- struct per_cpu_pageset *pageset;
- int j;
-
- pageset = zone_pcp(zone, i);
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- if (pageset->pcp[j].count)
- break;
- }
- if (j == ARRAY_SIZE(pageset->pcp))
- continue;
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- seq_printf(m,
- "\n cpu: %i pcp: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
- i, j,
- pageset->pcp[j].count,
- pageset->pcp[j].high,
- pageset->pcp[j].batch);
- }
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i,
+ pageset->pcp.count,
+ pageset->pcp.high,
+ pageset->pcp.batch);
#ifdef CONFIG_SMP
- seq_printf(m, "\n vm stats threshold: %d",
- pageset->stat_threshold);
+ seq_printf(m, "\n vm stats threshold: %d",
+ pageset->stat_threshold);
#endif
- }
- seq_printf(m,
- "\n all_unreclaimable: %u"
- "\n prev_priority: %i"
- "\n temp_priority: %i"
- "\n start_pfn: %lu",
- zone->all_unreclaimable,
- zone->prev_priority,
- zone->temp_priority,
- zone->zone_start_pfn);
- spin_unlock_irqrestore(&zone->lock, flags);
- seq_putc(m, '\n');
}
+ seq_printf(m,
+ "\n all_unreclaimable: %u"
+ "\n start_pfn: %lu"
+ "\n inactive_ratio: %u",
+ !zone_reclaimable(zone),
+ zone->zone_start_pfn,
+ zone->inactive_ratio);
+ seq_putc(m, '\n');
+}
+
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+ walk_zones_in_node(m, pgdat, zoneinfo_show_print);
return 0;
}
-struct seq_operations zoneinfo_op = {
+static const struct seq_operations zoneinfo_op = {
.start = frag_start, /* iterate over all zones. The same as in
* fragmentation. */
.next = frag_next,
@@ -608,36 +1130,56 @@ struct seq_operations zoneinfo_op = {
.show = zoneinfo_show,
};
+static int zoneinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &zoneinfo_op);
+}
+
+static const struct file_operations proc_zoneinfo_file_operations = {
+ .open = zoneinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- unsigned long *e;
-#endif
- int i;
+ int i, stat_items_size;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
+ stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
+ stat_items_size += sizeof(struct vm_event_state);
#endif
+
+ v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
#endif
- return v + *pos;
+ return (unsigned long *)m->private + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -663,44 +1205,283 @@ static void vmstat_stop(struct seq_file *m, void *arg)
m->private = NULL;
}
-struct seq_operations vmstat_op = {
+static const struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
.show = vmstat_show,
};
+static int vmstat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &vmstat_op);
+}
+
+static const struct file_operations proc_vmstat_file_operations = {
+ .open = vmstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+int sysctl_stat_interval __read_mostly = HZ;
+
+static void vmstat_update(struct work_struct *w)
+{
+ refresh_cpu_vm_stats();
+ schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+}
+
+static void start_cpu_timer(int cpu)
+{
+ struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+
+ INIT_DEFERRABLE_WORK(work, vmstat_update);
+ schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+}
+
+static void vmstat_cpu_dead(int node)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (cpu_to_node(cpu) == node)
+ goto end;
+
+ node_clear_state(node, N_CPU);
+end:
+ put_online_cpus();
+}
+
/*
* Use the cpu notifier to insure that the thresholds are recalculated
* when necessary.
*/
-static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+static int vmstat_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
+ long cpu = (long)hcpu;
+
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- refresh_zone_stat_thresholds();
- break;
- default:
- break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ refresh_zone_stat_thresholds();
+ start_cpu_timer(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ per_cpu(vmstat_work, cpu).work.func = NULL;
+ break;
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ start_cpu_timer(cpu);
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ refresh_zone_stat_thresholds();
+ vmstat_cpu_dead(cpu_to_node(cpu));
+ break;
+ default:
+ break;
}
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata vmstat_notifier =
+static struct notifier_block vmstat_notifier =
{ &vmstat_cpuup_callback, NULL, 0 };
+#endif
-int __init setup_vmstat(void)
+static int __init setup_vmstat(void)
{
- refresh_zone_stat_thresholds();
- register_cpu_notifier(&vmstat_notifier);
+#ifdef CONFIG_SMP
+ int cpu;
+
+ cpu_notifier_register_begin();
+ __register_cpu_notifier(&vmstat_notifier);
+
+ for_each_online_cpu(cpu) {
+ start_cpu_timer(cpu);
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ }
+ cpu_notifier_register_done();
+#endif
+#ifdef CONFIG_PROC_FS
+ proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+ proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
+ proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+ proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
+#endif
return 0;
}
module_init(setup_vmstat)
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
+#include <linux/debugfs.h>
+
+
+/*
+ * Return an index indicating how much of the available free memory is
+ * unusable for an allocation of the requested size.
+ */
+static int unusable_free_index(unsigned int order,
+ struct contig_page_info *info)
+{
+ /* No free memory is interpreted as all free memory is unusable */
+ if (info->free_pages == 0)
+ return 1000;
+
+ /*
+ * Index should be a value between 0 and 1. Return a value to 3
+ * decimal places.
+ *
+ * 0 => no fragmentation
+ * 1 => high fragmentation
+ */
+ return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
+
+}
+
+static void unusable_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = unusable_free_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display unusable free space index
+ *
+ * The unusable free space index measures how much of the available free
+ * memory cannot be used to satisfy an allocation of a given size and is a
+ * value between 0 and 1. The higher the value, the more of free memory is
+ * unusable and by implication, the worse the external fragmentation is. This
+ * can be expressed as a percentage by multiplying by 100.
+ */
+static int unusable_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ /* check memoryless node */
+ if (!node_state(pgdat->node_id, N_MEMORY))
+ return 0;
+
+ walk_zones_in_node(m, pgdat, unusable_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations unusable_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = unusable_show,
+};
+
+static int unusable_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &unusable_op);
+}
+
+static const struct file_operations unusable_file_ops = {
+ .open = unusable_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void extfrag_show_print(struct seq_file *m,
+ pg_data_t *pgdat, struct zone *zone)
+{
+ unsigned int order;
+ int index;
+
+ /* Alloc on stack as interrupts are disabled for zone walk */
+ struct contig_page_info info;
+
+ seq_printf(m, "Node %d, zone %8s ",
+ pgdat->node_id,
+ zone->name);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ fill_contig_page_info(zone, order, &info);
+ index = __fragmentation_index(order, &info);
+ seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ }
+
+ seq_putc(m, '\n');
+}
+
+/*
+ * Display fragmentation index for orders that allocations would fail for
+ */
+static int extfrag_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ walk_zones_in_node(m, pgdat, extfrag_show_print);
+
+ return 0;
+}
+
+static const struct seq_operations extfrag_op = {
+ .start = frag_start,
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = extfrag_show,
+};
+
+static int extfrag_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &extfrag_op);
+}
+
+static const struct file_operations extfrag_file_ops = {
+ .open = extfrag_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init extfrag_debug_init(void)
+{
+ struct dentry *extfrag_debug_root;
+
+ extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
+ if (!extfrag_debug_root)
+ return -ENOMEM;
+
+ if (!debugfs_create_file("unusable_index", 0444,
+ extfrag_debug_root, NULL, &unusable_file_ops))
+ goto fail;
+
+ if (!debugfs_create_file("extfrag_index", 0444,
+ extfrag_debug_root, NULL, &extfrag_file_ops))
+ goto fail;
+
+ return 0;
+fail:
+ debugfs_remove_recursive(extfrag_debug_root);
+ return -ENOMEM;
+}
+
+module_init(extfrag_debug_init);
#endif
diff --git a/mm/workingset.c b/mm/workingset.c
new file mode 100644
index 00000000000..f7216fa7da2
--- /dev/null
+++ b/mm/workingset.c
@@ -0,0 +1,414 @@
+/*
+ * Workingset detection
+ *
+ * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/memcontrol.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+/*
+ * Double CLOCK lists
+ *
+ * Per zone, two clock lists are maintained for file pages: the
+ * inactive and the active list. Freshly faulted pages start out at
+ * the head of the inactive list and page reclaim scans pages from the
+ * tail. Pages that are accessed multiple times on the inactive list
+ * are promoted to the active list, to protect them from reclaim,
+ * whereas active pages are demoted to the inactive list when the
+ * active list grows too big.
+ *
+ * fault ------------------------+
+ * |
+ * +--------------+ | +-------------+
+ * reclaim <- | inactive | <-+-- demotion | active | <--+
+ * +--------------+ +-------------+ |
+ * | |
+ * +-------------- promotion ------------------+
+ *
+ *
+ * Access frequency and refault distance
+ *
+ * A workload is thrashing when its pages are frequently used but they
+ * are evicted from the inactive list every time before another access
+ * would have promoted them to the active list.
+ *
+ * In cases where the average access distance between thrashing pages
+ * is bigger than the size of memory there is nothing that can be
+ * done - the thrashing set could never fit into memory under any
+ * circumstance.
+ *
+ * However, the average access distance could be bigger than the
+ * inactive list, yet smaller than the size of memory. In this case,
+ * the set could fit into memory if it weren't for the currently
+ * active pages - which may be used more, hopefully less frequently:
+ *
+ * +-memory available to cache-+
+ * | |
+ * +-inactive------+-active----+
+ * a b | c d e f g h i | J K L M N |
+ * +---------------+-----------+
+ *
+ * It is prohibitively expensive to accurately track access frequency
+ * of pages. But a reasonable approximation can be made to measure
+ * thrashing on the inactive list, after which refaulting pages can be
+ * activated optimistically to compete with the existing active pages.
+ *
+ * Approximating inactive page access frequency - Observations:
+ *
+ * 1. When a page is accessed for the first time, it is added to the
+ * head of the inactive list, slides every existing inactive page
+ * towards the tail by one slot, and pushes the current tail page
+ * out of memory.
+ *
+ * 2. When a page is accessed for the second time, it is promoted to
+ * the active list, shrinking the inactive list by one slot. This
+ * also slides all inactive pages that were faulted into the cache
+ * more recently than the activated page towards the tail of the
+ * inactive list.
+ *
+ * Thus:
+ *
+ * 1. The sum of evictions and activations between any two points in
+ * time indicates the minimum number of inactive pages accessed in
+ * between.
+ *
+ * 2. Moving one inactive page N page slots towards the tail of the
+ * list requires at least N inactive page accesses.
+ *
+ * Combining these:
+ *
+ * 1. When a page is finally evicted from memory, the number of
+ * inactive pages accessed while the page was in cache is at least
+ * the number of page slots on the inactive list.
+ *
+ * 2. In addition, measuring the sum of evictions and activations (E)
+ * at the time of a page's eviction, and comparing it to another
+ * reading (R) at the time the page faults back into memory tells
+ * the minimum number of accesses while the page was not cached.
+ * This is called the refault distance.
+ *
+ * Because the first access of the page was the fault and the second
+ * access the refault, we combine the in-cache distance with the
+ * out-of-cache distance to get the complete minimum access distance
+ * of this page:
+ *
+ * NR_inactive + (R - E)
+ *
+ * And knowing the minimum access distance of a page, we can easily
+ * tell if the page would be able to stay in cache assuming all page
+ * slots in the cache were available:
+ *
+ * NR_inactive + (R - E) <= NR_inactive + NR_active
+ *
+ * which can be further simplified to
+ *
+ * (R - E) <= NR_active
+ *
+ * Put into words, the refault distance (out-of-cache) can be seen as
+ * a deficit in inactive list space (in-cache). If the inactive list
+ * had (R - E) more page slots, the page would not have been evicted
+ * in between accesses, but activated instead. And on a full system,
+ * the only thing eating into inactive list space is active pages.
+ *
+ *
+ * Activating refaulting pages
+ *
+ * All that is known about the active list is that the pages have been
+ * accessed more than once in the past. This means that at any given
+ * time there is actually a good chance that pages on the active list
+ * are no longer in active use.
+ *
+ * So when a refault distance of (R - E) is observed and there are at
+ * least (R - E) active pages, the refaulting page is activated
+ * optimistically in the hope that (R - E) active pages are actually
+ * used less frequently than the refaulting page - or even not used at
+ * all anymore.
+ *
+ * If this is wrong and demotion kicks in, the pages which are truly
+ * used more frequently will be reactivated while the less frequently
+ * used ones will be evicted from memory.
+ *
+ * But if this is right, the stale pages will be pushed out of memory
+ * and the used pages get to stay in cache.
+ *
+ *
+ * Implementation
+ *
+ * For each zone's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (zone->inactive_age).
+ *
+ * On eviction, a snapshot of this counter (along with some bits to
+ * identify the zone) is stored in the now empty page cache radix tree
+ * slot of the evicted page. This is called a shadow entry.
+ *
+ * On cache misses for which there are shadow entries, an eligible
+ * refault distance will immediately activate the refaulting page.
+ */
+
+static void *pack_shadow(unsigned long eviction, struct zone *zone)
+{
+ eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
+ eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
+ eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+ return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+}
+
+static void unpack_shadow(void *shadow,
+ struct zone **zone,
+ unsigned long *distance)
+{
+ unsigned long entry = (unsigned long)shadow;
+ unsigned long eviction;
+ unsigned long refault;
+ unsigned long mask;
+ int zid, nid;
+
+ entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+ zid = entry & ((1UL << ZONES_SHIFT) - 1);
+ entry >>= ZONES_SHIFT;
+ nid = entry & ((1UL << NODES_SHIFT) - 1);
+ entry >>= NODES_SHIFT;
+ eviction = entry;
+
+ *zone = NODE_DATA(nid)->node_zones + zid;
+
+ refault = atomic_long_read(&(*zone)->inactive_age);
+ mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
+ RADIX_TREE_EXCEPTIONAL_SHIFT);
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ *distance = (refault - eviction) & mask;
+}
+
+/**
+ * workingset_eviction - note the eviction of a page from memory
+ * @mapping: address space the page was backing
+ * @page: the page being evicted
+ *
+ * Returns a shadow entry to be stored in @mapping->page_tree in place
+ * of the evicted @page so that a later refault can be detected.
+ */
+void *workingset_eviction(struct address_space *mapping, struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long eviction;
+
+ eviction = atomic_long_inc_return(&zone->inactive_age);
+ return pack_shadow(eviction, zone);
+}
+
+/**
+ * workingset_refault - evaluate the refault of a previously evicted page
+ * @shadow: shadow entry of the evicted page
+ *
+ * Calculates and evaluates the refault distance of the previously
+ * evicted page in the context of the zone it was allocated in.
+ *
+ * Returns %true if the page should be activated, %false otherwise.
+ */
+bool workingset_refault(void *shadow)
+{
+ unsigned long refault_distance;
+ struct zone *zone;
+
+ unpack_shadow(shadow, &zone, &refault_distance);
+ inc_zone_state(zone, WORKINGSET_REFAULT);
+
+ if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ inc_zone_state(zone, WORKINGSET_ACTIVATE);
+ return true;
+ }
+ return false;
+}
+
+/**
+ * workingset_activation - note a page activation
+ * @page: page that is being activated
+ */
+void workingset_activation(struct page *page)
+{
+ atomic_long_inc(&page_zone(page)->inactive_age);
+}
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload. In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes. To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long shadow_nodes;
+ unsigned long max_nodes;
+ unsigned long pages;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ local_irq_enable();
+
+ pages = node_present_pages(sc->nid);
+ /*
+ * Active cache pages are limited to 50% of memory, and shadow
+ * entries that represent a refault distance bigger than that
+ * do not have any effect. Limit the number of shadow nodes
+ * such that shadow entries do not exceed the number of active
+ * cache pages, assuming a worst-case node population density
+ * of 1/8th on average.
+ *
+ * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * each, this will reclaim shadow entries when they consume
+ * ~2% of available memory:
+ *
+ * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+ */
+ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+ if (shadow_nodes <= max_nodes)
+ return 0;
+
+ return shadow_nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct address_space *mapping;
+ struct radix_tree_node *node;
+ unsigned int i;
+ int ret;
+
+ /*
+ * Page cache insertions and deletions synchronously maintain
+ * the shadow node LRU under the mapping->tree_lock and the
+ * lru_lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the lru_lock pins any
+ * address_space that has radix tree nodes on the LRU.
+ *
+ * We can then safely transition to the mapping->tree_lock to
+ * pin only the address_space of the particular node we want
+ * to reclaim, take the node off-LRU, and drop the lru_lock.
+ */
+
+ node = container_of(item, struct radix_tree_node, private_list);
+ mapping = node->private_data;
+
+ /* Coming from the list, invert the lock order */
+ if (!spin_trylock(&mapping->tree_lock)) {
+ spin_unlock(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
+ list_del_init(item);
+ spin_unlock(lru_lock);
+
+ /*
+ * The nodes should only contain one or more shadow entries,
+ * no pages, so we expect to be able to remove them all and
+ * delete and free the empty node afterwards.
+ */
+
+ BUG_ON(!node->count);
+ BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[i]) {
+ BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ node->slots[i] = NULL;
+ BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+ node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+ BUG_ON(!mapping->nrshadows);
+ mapping->nrshadows--;
+ }
+ }
+ BUG_ON(node->count);
+ inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ if (!__radix_tree_delete_node(&mapping->page_tree, node))
+ BUG();
+
+ spin_unlock(&mapping->tree_lock);
+ ret = LRU_REMOVED_RETRY;
+out:
+ local_irq_enable();
+ cond_resched();
+ local_irq_disable();
+ spin_lock(lru_lock);
+ return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long ret;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+ shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ local_irq_enable();
+ return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+ .count_objects = count_shadow_nodes,
+ .scan_objects = scan_shadow_nodes,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+ int ret;
+
+ ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ if (ret)
+ goto err;
+ ret = register_shrinker(&workingset_shadow_shrinker);
+ if (ret)
+ goto err_list_lru;
+ return 0;
+err_list_lru:
+ list_lru_destroy(&workingset_shadow_nodes);
+err:
+ return ret;
+}
+module_init(workingset_init);
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 00000000000..01df13a7e2e
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
+/*
+ * zbud.c
+ *
+ * Copyright (C) 2013, Seth Jennings, IBM
+ *
+ * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
+ *
+ * zbud is a special purpose allocator for storing compressed pages. Contrary
+ * to what its name may suggest, zbud is not a buddy allocator, but rather an
+ * allocator that "buddies" two compressed pages together in a single memory
+ * page.
+ *
+ * While this design limits storage density, it has simple and deterministic
+ * reclaim properties that make it preferable to a higher density approach when
+ * reclaim will be used.
+ *
+ * zbud works by storing compressed pages, or "zpages", together in pairs in a
+ * single memory page called a "zbud page". The first buddy is "left
+ * justified" at the beginning of the zbud page, and the last buddy is "right
+ * justified" at the end of the zbud page. The benefit is that if either
+ * buddy is freed, the freed buddy space, coalesced with whatever slack space
+ * that existed between the buddies, results in the largest possible free region
+ * within the zbud page.
+ *
+ * zbud also provides an attractive lower bound on density. The ratio of zpages
+ * to zbud pages can not be less than 1. This ensures that zbud can never "do
+ * harm" by using more pages to store zpages than the uncompressed zpages would
+ * have used on their own.
+ *
+ * zbud pages are divided into "chunks". The size of the chunks is fixed at
+ * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
+ * into chunks allows organizing unbuddied zbud pages into a manageable number
+ * of unbuddied lists according to the number of free chunks available in the
+ * zbud page.
+ *
+ * The zbud API differs from that of conventional allocators in that the
+ * allocation function, zbud_alloc(), returns an opaque handle to the user,
+ * not a dereferenceable pointer. The user must map the handle using
+ * zbud_map() in order to get a usable pointer by which to access the
+ * allocation data and unmap the handle with zbud_unmap() when operations
+ * on the allocation data are complete.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zbud.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
+ * will be 64 freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+
+/**
+ * struct zbud_pool - stores metadata for each zbud pool
+ * @lock: protects all pool fields and first|last_chunks fields of any
+ * zbud page in the pool
+ * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
+ * the list each zbud page is added to depends on the size of
+ * its free region.
+ * @buddied: list tracking the zbud pages that contain two buddies;
+ * these zbud pages are full
+ * @lru: list tracking the zbud pages in LRU order by most recently
+ * added buddy.
+ * @pages_nr: number of zbud pages in the pool.
+ * @ops: pointer to a structure of user defined operations specified at
+ * pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular zbud pool.
+ */
+struct zbud_pool {
+ spinlock_t lock;
+ struct list_head unbuddied[NCHUNKS];
+ struct list_head buddied;
+ struct list_head lru;
+ u64 pages_nr;
+ struct zbud_ops *ops;
+};
+
+/*
+ * struct zbud_header - zbud page metadata occupying the first chunk of each
+ * zbud page.
+ * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
+ * @lru: links the zbud page into the lru list in the pool
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @under_reclaim: cleared by init_zbud_page(); while it is set, zbud_free()
+ * only zeroes first|last_chunks and leaves the list handling and the
+ * freeing of the page to reclaim
+ */
+struct zbud_header {
+ struct list_head buddy;
+ struct list_head lru;
+ unsigned int first_chunks;
+ unsigned int last_chunks;
+ bool under_reclaim;
+};
+
+/*****************
+ * Helpers
+*****************/
+/* Just to make the code easier to read */
+enum buddy {
+ FIRST,
+ LAST
+};
+
+/* Converts an allocation size in bytes to size in zbud chunks */
+static int size_to_chunks(int size)
+{
+ return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the zbud header of a newly allocated zbud page */
+static struct zbud_header *init_zbud_page(struct page *page)
+{
+ struct zbud_header *zhdr = page_address(page);
+ zhdr->first_chunks = 0;
+ zhdr->last_chunks = 0;
+ INIT_LIST_HEAD(&zhdr->buddy);
+ INIT_LIST_HEAD(&zhdr->lru);
+ zhdr->under_reclaim = 0;
+ return zhdr;
+}
+
+/* Frees the page that holds the given zbud header; no struct page fields are reset here */
+static void free_zbud_page(struct zbud_header *zhdr)
+{
+ __free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a zbud page
+ * Pool lock should be held as this function accesses first|last_chunks
+ */
+static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
+{
+ unsigned long handle;
+
+ /*
+ * For now, the encoded handle is actually just the pointer to the data
+ * but this might not always be the case. A little information hiding.
+ * Add CHUNK_SIZE to the handle if it is the first allocation to jump
+ * over the zbud header in the first chunk.
+ */
+ handle = (unsigned long)zhdr;
+ if (bud == FIRST)
+ /* skip over zbud header */
+ handle += ZHDR_SIZE_ALIGNED;
+ else /* bud == LAST */
+ handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+ return handle;
+}
+
+/* Returns the zbud page where a given handle is stored */
+static struct zbud_header *handle_to_zbud_header(unsigned long handle)
+{
+ return (struct zbud_header *)(handle & PAGE_MASK);
+}
+
+/* Returns the number of free chunks in a zbud page */
+static int num_free_chunks(struct zbud_header *zhdr)
+{
+ /*
+ * Rather than branch for different situations, just use the fact that
+ * free buddies have a length of zero to simplify everything. -1 at the
+ * end for the zbud header.
+ */
+ return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * zbud_create_pool() - create a new zbud pool
+ * @gfp: gfp flags when allocating the zbud pool structure
+ * @ops: user-defined operations for the zbud pool
+ *
+ * Return: pointer to the new zbud pool or NULL if the metadata allocation
+ * failed.
+ */
+struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+{
+ struct zbud_pool *pool;
+ int i;
+
+ pool = kmalloc(sizeof(struct zbud_pool), gfp);
+ if (!pool)
+ return NULL;
+ spin_lock_init(&pool->lock);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&pool->unbuddied[i]);
+ INIT_LIST_HEAD(&pool->buddied);
+ INIT_LIST_HEAD(&pool->lru);
+ pool->pages_nr = 0;
+ pool->ops = ops;
+ return pool;
+}
+
+/**
+ * zbud_destroy_pool() - destroys an existing zbud pool
+ * @pool: the zbud pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+void zbud_destroy_pool(struct zbud_pool *pool)
+{
+ kfree(pool);
+}
+
+/**
+ * zbud_alloc() - allocates a region of a given size
+ * @pool: zbud pool from which to allocate
+ * @size: size in bytes of the desired allocation
+ * @gfp: gfp flags used if the pool needs to grow
+ * @handle: handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request. A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as zbud pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size is
+ * zero or gfp contains __GFP_HIGHMEM, -ENOSPC if the size exceeds
+ * PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE, or -ENOMEM if the pool was
+ * unable to allocate a new page.
+ */
+int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp,
+ unsigned long *handle)
+{
+ int chunks, i, freechunks;
+ struct zbud_header *zhdr = NULL;
+ enum buddy bud;
+ struct page *page;
+
+ /* Reject empty requests and highmem pool pages up front */
+ if (!size || (gfp & __GFP_HIGHMEM))
+ return -EINVAL;
+ /* Largest request: a page minus the aligned header and one chunk */
+ if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+ return -ENOSPC;
+ chunks = size_to_chunks(size);
+ spin_lock(&pool->lock);
+
+ /* First, try to find an unbuddied zbud page. */
+ zhdr = NULL;
+ for_each_unbuddied_list(i, chunks) {
+ if (!list_empty(&pool->unbuddied[i])) {
+ zhdr = list_first_entry(&pool->unbuddied[i],
+ struct zbud_header, buddy);
+ list_del(&zhdr->buddy);
+ /* Use whichever slot of the page is still free */
+ if (zhdr->first_chunks == 0)
+ bud = FIRST;
+ else
+ bud = LAST;
+ goto found;
+ }
+ }
+
+ /* Couldn't find unbuddied zbud page, create new one */
+ /* The pool lock is dropped across the page allocation */
+ spin_unlock(&pool->lock);
+ page = alloc_page(gfp);
+ if (!page)
+ return -ENOMEM;
+ spin_lock(&pool->lock);
+ pool->pages_nr++;
+ zhdr = init_zbud_page(page);
+ /* The allocation takes the FIRST slot of the new page */
+ bud = FIRST;
+
+found:
+ /* Record the size of the new allocation in the chosen slot */
+ if (bud == FIRST)
+ zhdr->first_chunks = chunks;
+ else
+ zhdr->last_chunks = chunks;
+
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
+ /* Add to unbuddied list */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ } else {
+ /* Add to buddied list */
+ list_add(&zhdr->buddy, &pool->buddied);
+ }
+
+ /* Add/move zbud page to beginning of LRU */
+ if (!list_empty(&zhdr->lru))
+ list_del(&zhdr->lru);
+ list_add(&zhdr->lru, &pool->lru);
+
+ /* The handle is encoded while the pool lock is still held */
+ *handle = encode_handle(zhdr, bud);
+ spin_unlock(&pool->lock);
+
+ return 0;
+}
+
+/**
+ * zbud_free() - frees the allocation associated with the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by zbud_alloc()
+ *
+ * In the case that the zbud page in which the allocation resides is under
+ * reclaim, as indicated by the under_reclaim flag in its zbud header being
+ * set, this function only sets the first|last_chunks to 0. The page is
+ * actually freed once both buddies are evicted (see zbud_reclaim_page()
+ * below).
+ */
+void zbud_free(struct zbud_pool *pool, unsigned long handle)
+{
+ struct zbud_header *zhdr;
+ int freechunks;
+
+ spin_lock(&pool->lock);
+ zhdr = handle_to_zbud_header(handle);
+
+ /* If first buddy, handle will be page aligned */
+ if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
+ zhdr->last_chunks = 0;
+ else
+ zhdr->first_chunks = 0;
+
+ if (zhdr->under_reclaim) {
+ /* zbud page is under reclaim, reclaim will free */
+ spin_unlock(&pool->lock);
+ return;
+ }
+
+ /* Remove from existing buddy list */
+ list_del(&zhdr->buddy);
+
+ if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+ /* zbud page is empty, free */
+ list_del(&zhdr->lru);
+ free_zbud_page(zhdr);
+ pool->pages_nr--;
+ } else {
+ /* Add to unbuddied list */
+ /* One buddy is still in use; refile by remaining free chunks */
+ freechunks = num_free_chunks(zhdr);
+ list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ }
+
+ spin_unlock(&pool->lock);
+}
+
+/* Entry linked from (ptr)->prev, i.e. the element at the tail of list ptr */
+#define list_tail_entry(ptr, type, member) \
+ list_entry((ptr)->prev, type, member)
+
+/**
+ * zbud_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool:	pool from which a page will attempt to be evicted
+ * @retries:	number of pages on the LRU list for which eviction will
+ *		be attempted before failing
+ *
+ * zbud reclaim is different from normal system reclaim in that the reclaim is
+ * done from the bottom, up.  This is because only the bottom layer, zbud, has
+ * information on how the allocations are organized within each zbud page. This
+ * has the potential to create interesting locking situations between zbud and
+ * the user, however.
+ *
+ * To avoid these, this is how zbud_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls zbud_reclaim_page().
+ * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
+ * the user-defined eviction handler with the pool and handle as arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. zbud_reclaim_page() will add the zbud page back to the
+ * appropriate list and try the next zbud page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the under_reclaim flag set in the zbud header.
+ *
+ * If all buddies in the zbud page are successfully evicted, then the
+ * zbud page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict, an eviction handler is not registered or retries is 0,
+ * -EAGAIN if the retry limit was hit.
+ */
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+{
+	/* unsigned, to match @retries and avoid a signed/unsigned compare */
+	unsigned int i;
+	int ret, freechunks;
+	struct zbud_header *zhdr;
+	unsigned long first_handle = 0, last_handle = 0;
+
+	spin_lock(&pool->lock);
+	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+			retries == 0) {
+		spin_unlock(&pool->lock);
+		return -EINVAL;
+	}
+	for (i = 0; i < retries; i++) {
+		/* Pick the page at the tail of the LRU and unlink it */
+		zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
+		list_del(&zhdr->lru);
+		list_del(&zhdr->buddy);
+		/* Protect zbud page against free */
+		zhdr->under_reclaim = true;
+		/*
+		 * We need encode the handles before unlocking, since we can
+		 * race with free that will set (first|last)_chunks to 0
+		 */
+		first_handle = 0;
+		last_handle = 0;
+		if (zhdr->first_chunks)
+			first_handle = encode_handle(zhdr, FIRST);
+		if (zhdr->last_chunks)
+			last_handle = encode_handle(zhdr, LAST);
+		spin_unlock(&pool->lock);
+
+		/*
+		 * Issue the eviction callback(s).  The second buddy is only
+		 * tried when the first one (if any) was evicted successfully.
+		 * The outcome is judged below from the chunk counters, which
+		 * zbud_free() clears, not from the callback return value.
+		 */
+		ret = 0;
+		if (first_handle)
+			ret = pool->ops->evict(pool, first_handle);
+		if (!ret && last_handle)
+			ret = pool->ops->evict(pool, last_handle);
+
+		spin_lock(&pool->lock);
+		zhdr->under_reclaim = false;
+		if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+			/*
+			 * Both buddies are now free, free the zbud page and
+			 * return success.
+			 */
+			free_zbud_page(zhdr);
+			pool->pages_nr--;
+			spin_unlock(&pool->lock);
+			return 0;
+		} else if (zhdr->first_chunks == 0 ||
+				zhdr->last_chunks == 0) {
+			/* add to unbuddied list */
+			freechunks = num_free_chunks(zhdr);
+			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		} else {
+			/* add to buddied list */
+			list_add(&zhdr->buddy, &pool->buddied);
+		}
+
+		/* add to beginning of LRU */
+		list_add(&zhdr->lru, &pool->lru);
+	}
+	spin_unlock(&pool->lock);
+	return -EAGAIN;
+}
+
+/**
+ * zbud_map() - maps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be mapped
+ *
+ * While trivial for zbud, the mapping functions for other allocators
+ * implementing this allocation API could have more complex information encoded
+ * in the handle and could create temporary mappings to make the data
+ * accessible to the user.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+{
+ /* The handle value itself is returned as the pointer; pool is unused */
+ return (void *)(handle);
+}
+
+/**
+ * zbud_unmap() - unmaps the allocation associated with the given handle
+ * @pool: pool in which the allocation resides
+ * @handle: handle associated with the allocation to be unmapped
+ *
+ * This is a no-op for zbud: zbud_map() creates no mapping to undo.
+ */
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zbud_get_pool_size() - gets the zbud pool size in pages
+ * @pool: pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool. The pool lock need not be
+ * taken to access pages_nr.
+ */
+u64 zbud_get_pool_size(struct zbud_pool *pool)
+{
+ /* Plain read of the counter, without taking pool->lock */
+ return pool->pages_nr;
+}
+
+/* Module init: compile-time check of the header size, then log the load */
+static int __init init_zbud(void)
+{
+ /* Make sure the zbud header will fit in one chunk */
+ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
+ pr_info("loaded\n");
+ return 0;
+}
+
+/* Module exit: only logs the unload */
+static void __exit exit_zbud(void)
+{
+ pr_info("unloaded\n");
+}
+
+module_init(init_zbud);
+module_exit(exit_zbud);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 00000000000..fe78189624c
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1117 @@
+/*
+ * zsmalloc memory allocator
+ *
+ * Copyright (C) 2011 Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the license that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+/*
+ * This allocator is designed for use with zram. Thus, the allocator is
+ * supposed to work well under low memory conditions. In particular, it
+ * never attempts higher order page allocation which is very likely to
+ * fail under memory pressure. On the other hand, if we just use single
+ * (0-order) pages, it would suffer from very high fragmentation --
+ * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+ * This was one of the major issues with its predecessor (xvmalloc).
+ *
+ * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+ * and links them together using various 'struct page' fields. These linked
+ * pages act as a single higher-order page i.e. an object can span 0-order
+ * page boundaries. The code refers to these linked pages as a single entity
+ * called zspage.
+ *
+ * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+ * since this satisfies the requirements of all its current users (in the
+ * worst case, page is incompressible and is thus stored "as-is" i.e. in
+ * uncompressed form). For allocation requests larger than this size, failure
+ * is returned (see zs_malloc).
+ *
+ * Additionally, zs_malloc() does not return a dereferenceable pointer.
+ * Instead, it returns an opaque handle (unsigned long) which encodes actual
+ * location of the allocated object. The reason for this indirection is that
+ * zsmalloc does not keep zspages permanently mapped since that would cause
+ * issues on 32-bit systems where the VA region for kernel space mappings
+ * is very small. So, before using the allocated memory, the object has to
+ * be mapped using zs_map_object() to get a usable pointer and subsequently
+ * unmapped using zs_unmap_object().
+ *
+ * Following is how we use various fields and flags of underlying
+ * struct page(s) to form a zspage.
+ *
+ * Usage of struct page fields:
+ * page->first_page: points to the first component (0-order) page
+ * page->index (union with page->freelist): offset of the first object
+ * starting in this page. For the first page, this is
+ * always 0, so we use this field (aka freelist) to point
+ * to the first free object in zspage.
+ * page->lru: links together all component pages (except the first page)
+ * of a zspage
+ *
+ * For _first_ page only:
+ *
+ * page->private (union with page->first_page): refers to the
+ * component page after the first page
+ * page->freelist: points to the first free object in zspage.
+ * Free objects are linked together using in-place
+ * metadata.
+ * page->objects: maximum number of objects we can store in this
+ * zspage (class->pages_per_zspage * PAGE_SIZE / class->size)
+ * page->lru: links together first pages of various zspages.
+ * Basically forming list of zspages in a fullness group.
+ * page->mapping: class index and fullness group of the zspage
+ *
+ * Usage of struct page flags:
+ * PG_private: identifies the first component page
+ * PG_private2: identifies the last component page
+ *
+ */
+
+#ifdef CONFIG_ZSMALLOC_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/zsmalloc.h>
+
+/*
+ * This must be power of 2 and greater than or equal to sizeof(link_free).
+ * These two conditions ensure that any 'struct link_free' itself doesn't
+ * span more than 1 page which avoids complex case of mapping 2 pages simply
+ * to restore link_free pointer values.
+ */
+#define ZS_ALIGN 8
+
+/*
+ * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
+ * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
+ */
+#define ZS_MAX_ZSPAGE_ORDER 2
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+
+/*
+ * Object location (<PFN>, <obj_idx>) is encoded as
+ * a single (unsigned long) handle value.
+ *
+ * Note that object index <obj_idx> is relative to system
+ * page <PFN> it is stored in, so for each sub-page belonging
+ * to a zspage, obj_idx starts with 0.
+ *
+ * This is made more complicated by various memory models and PAE.
+ */
+
+#ifndef MAX_PHYSMEM_BITS
+#ifdef CONFIG_HIGHMEM64G
+#define MAX_PHYSMEM_BITS 36
+#else /* !CONFIG_HIGHMEM64G */
+/*
+ * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
+ * be PAGE_SHIFT
+ */
+#define MAX_PHYSMEM_BITS BITS_PER_LONG
+#endif
+#endif
+#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
+#define ZS_MIN_ALLOC_SIZE \
+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
+
+/*
+ * On systems with 4K page size, this gives 255 size classes! There is a
+ * trade-off here:
+ * - Large number of size classes is potentially wasteful as free pages are
+ * spread across these classes
+ * - Small number of size classes causes large internal fragmentation
+ * - Probably its better to use specific size classes (empirically
+ * determined). NOTE: all those class sizes must be set as multiple of
+ * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
+ *
+ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
+ * (reason above)
+ */
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
+ ZS_SIZE_CLASS_DELTA + 1)
+
+/*
+ * We do not maintain any list for completely empty or full pages
+ */
+enum fullness_group {
+ ZS_ALMOST_FULL,
+ ZS_ALMOST_EMPTY,
+ _ZS_NR_FULLNESS_GROUPS,
+
+ ZS_EMPTY,
+ ZS_FULL
+};
+
+/*
+ * We assign a page to ZS_ALMOST_EMPTY fullness group when:
+ * n <= N / f, where
+ * n = number of allocated objects
+ * N = total number of objects zspage can store
+ * f = fullness_threshold_frac
+ *
+ * Similarly, we assign zspage to:
+ * ZS_ALMOST_FULL when n > N / f
+ * ZS_EMPTY when n == 0
+ * ZS_FULL when n == N
+ *
+ * (see: fix_fullness_group())
+ */
+static const int fullness_threshold_frac = 4;
+
+struct size_class {
+ /*
+ * Size of objects stored in this class. Must be multiple
+ * of ZS_ALIGN.
+ */
+ int size;
+ unsigned int index;
+
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
+
+ spinlock_t lock;
+
+ /* stats */
+ u64 pages_allocated;
+
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+};
+
+/*
+ * Placed within free objects to form a singly linked list.
+ * For every zspage, first_page->freelist gives head of this list.
+ *
+ * This must be power of 2 and less than or equal to ZS_ALIGN
+ */
+struct link_free {
+ /* Handle of next free chunk (encodes <PFN, obj_idx>) */
+ void *next;
+};
+
+struct zs_pool {
+ struct size_class size_class[ZS_SIZE_CLASSES];
+
+ gfp_t flags; /* allocation flags used when growing pool */
+};
+
+/*
+ * A zspage's class index and fullness group
+ * are encoded in its (first)page->mapping
+ */
+#define CLASS_IDX_BITS 28
+#define FULLNESS_BITS 4
+#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
+#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
+
+struct mapping_area {
+#ifdef CONFIG_PGTABLE_MAPPING
+ struct vm_struct *vm; /* vm area for mapping object that span pages */
+#else
+ char *vm_buf; /* copy buffer for objects that span pages */
+#endif
+ char *vm_addr; /* address of kmap_atomic()'ed pages */
+ enum zs_mapmode vm_mm; /* mapping mode */
+};
+
+
+/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+
+/* Non-zero if @page has PG_private set, which marks a zspage's first page */
+static int is_first_page(struct page *page)
+{
+ return PagePrivate(page);
+}
+
+/* Non-zero if @page has PG_private2 set, which marks a zspage's last page */
+static int is_last_page(struct page *page)
+{
+ return PagePrivate2(page);
+}
+
+/*
+ * Unpack the class index and fullness group that set_zspage_mapping()
+ * stored in the ->mapping field of a zspage's first page.
+ */
+static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+				enum fullness_group *fullness)
+{
+	unsigned long packed;
+
+	BUG_ON(!is_first_page(page));
+
+	packed = (unsigned long)page->mapping;
+	/* class index sits above the low FULLNESS_BITS bits */
+	*class_idx = (packed >> FULLNESS_BITS) & CLASS_IDX_MASK;
+	*fullness = packed & FULLNESS_MASK;
+}
+
+/*
+ * Pack the class index and fullness group into the ->mapping field of a
+ * zspage's first page: fullness in the low FULLNESS_BITS bits, class index
+ * above them.
+ */
+static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+				enum fullness_group fullness)
+{
+	unsigned long packed;
+
+	BUG_ON(!is_first_page(page));
+
+	packed = (class_idx & CLASS_IDX_MASK) << FULLNESS_BITS;
+	packed |= fullness & FULLNESS_MASK;
+	page->mapping = (struct address_space *)packed;
+}
+
+/*
+ * zsmalloc divides the pool into various size classes where each
+ * class maintains a list of zspages where each zspage is divided
+ * into equal sized chunks. Each allocation falls into one of these
+ * classes depending on its size. This function returns the index of the
+ * size class whose chunk size is big enough to hold the given size.
+ * Sizes up to ZS_MIN_ALLOC_SIZE all map to class 0.
+ */
+static int get_size_class_index(int size)
+{
+	if (unlikely(size <= ZS_MIN_ALLOC_SIZE))
+		return 0;
+
+	return DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA);
+}
+
+/*
+ * For each size class, zspages are divided into different groups
+ * depending on how "full" they are. This was done so that we could
+ * easily find empty or nearly empty zspages when we try to shrink
+ * the pool (not yet implemented). This function returns the fullness
+ * group matching the current ->inuse / ->objects counts of the given
+ * first page.
+ */
+static enum fullness_group get_fullness_group(struct page *page)
+{
+	int used, capacity;
+
+	BUG_ON(!is_first_page(page));
+
+	used = page->inuse;
+	capacity = page->objects;
+
+	if (!used)
+		return ZS_EMPTY;
+	if (used == capacity)
+		return ZS_FULL;
+	if (used <= capacity / fullness_threshold_frac)
+		return ZS_ALMOST_EMPTY;
+
+	return ZS_ALMOST_FULL;
+}
+
+/*
+ * Each size class maintains various freelists and zspages are assigned
+ * to one of these freelists based on the number of live objects they
+ * have. This functions inserts the given zspage into the freelist
+ * identified by <class, fullness_group>.
+ *
+ * Fullness groups at or beyond _ZS_NR_FULLNESS_GROUPS (ZS_EMPTY, ZS_FULL)
+ * have no list, so the call is a no-op for them.
+ */
+static void insert_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ /* Link into the existing list, if there is one */
+ if (*head)
+ list_add_tail(&page->lru, &(*head)->lru);
+
+ /* The newly inserted zspage always becomes the list head */
+ *head = page;
+}
+
+/*
+ * This function removes the given zspage from the freelist identified
+ * by <class, fullness_group>.
+ *
+ * Fullness groups at or beyond _ZS_NR_FULLNESS_GROUPS have no list, so the
+ * call is a no-op for them.
+ */
+static void remove_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ BUG_ON(!*head);
+ /* Head is the only entry: the list becomes empty */
+ if (list_empty(&(*head)->lru))
+ *head = NULL;
+ /* Removing the head itself: the next entry takes over as head */
+ else if (*head == page)
+ *head = (struct page *)list_entry((*head)->lru.next,
+ struct page, lru);
+
+ list_del_init(&page->lru);
+}
+
+/*
+ * Each size class maintains zspages in different fullness groups depending
+ * on the number of live objects they contain. When allocating or freeing
+ * objects, the fullness status of the page can change, say, from ALMOST_FULL
+ * to ALMOST_EMPTY when freeing an object. This function checks if such
+ * a status change has occurred for the given page and accordingly moves the
+ * page from the freelist of the old fullness group to that of the new
+ * fullness group.
+ *
+ * Returns the fullness group the zspage belongs to after the call.
+ */
+static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+						struct page *page)
+{
+	/* unsigned, to match the get_zspage_mapping() out-parameter type */
+	unsigned int class_idx;
+	struct size_class *class;
+	enum fullness_group currfg, newfg;
+
+	BUG_ON(!is_first_page(page));
+
+	/* currfg: group recorded in ->mapping; newfg: group per live counts */
+	get_zspage_mapping(page, &class_idx, &currfg);
+	newfg = get_fullness_group(page);
+	if (newfg == currfg)
+		goto out;
+
+	class = &pool->size_class[class_idx];
+	remove_zspage(page, class, currfg);
+	insert_zspage(page, class, newfg);
+	set_zspage_mapping(page, class_idx, newfg);
+
+out:
+	return newfg;
+}
+
+/*
+ * We have to decide on how many pages to link together
+ * to form a zspage for each size class. This is important
+ * to reduce wastage due to unusable space left at end of
+ * each zspage which is given as:
+ *	wastage = Zp % class_size
+ * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
+ *
+ * For example, for size class of 3/8 * PAGE_SIZE, we should
+ * link together 3 PAGE_SIZE sized pages to form a zspage
+ * since then we can perfectly fit in 8 such objects.
+ *
+ * Returns the page count in 1..ZS_MAX_PAGES_PER_ZSPAGE with the highest
+ * used-space percentage; ties go to the smaller count.
+ */
+static int get_pages_per_zspage(int class_size)
+{
+	int nr_pages;
+	int best_pct = 0;
+	int best_nr_pages = 1;
+
+	for (nr_pages = 1; nr_pages <= ZS_MAX_PAGES_PER_ZSPAGE; nr_pages++) {
+		int total = nr_pages * PAGE_SIZE;
+		int used_pct = (total - total % class_size) * 100 / total;
+
+		/* only a strictly better percentage replaces the best */
+		if (used_pct <= best_pct)
+			continue;
+
+		best_pct = used_pct;
+		best_nr_pages = nr_pages;
+	}
+
+	return best_nr_pages;
+}
+
+/*
+ * A single 'zspage' is composed of many system pages which are
+ * linked together using fields in struct page. Given any component
+ * page of a zspage, return its first/head page: the page itself if it
+ * is the first page, otherwise the page recorded in ->first_page.
+ */
+static struct page *get_first_page(struct page *page)
+{
+	return is_first_page(page) ? page : page->first_page;
+}
+
+/*
+ * Return the component page that follows @page in its zspage, or NULL
+ * when @page is the last one.
+ */
+static struct page *get_next_page(struct page *page)
+{
+	if (is_last_page(page))
+		return NULL;
+
+	/* the first page keeps its successor in ->private */
+	if (is_first_page(page))
+		return (struct page *)page_private(page);
+
+	/* the remaining pages are chained through ->lru */
+	return list_entry(page->lru.next, struct page, lru);
+}
+
+/*
+ * Encode <page, obj_idx> as a single handle value.
+ * On hardware platforms with physical memory starting at 0x0 the pfn
+ * could be 0 so we ensure that the handle will never be 0 by storing
+ * obj_idx + 1 in the index bits. A NULL page (which must come with
+ * obj_idx 0) encodes to a NULL handle.
+ */
+static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+{
+	unsigned long pfn_part, idx_part;
+
+	if (!page) {
+		BUG_ON(obj_idx);
+		return NULL;
+	}
+
+	pfn_part = page_to_pfn(page) << OBJ_INDEX_BITS;
+	idx_part = (obj_idx + 1) & OBJ_INDEX_MASK;
+
+	return (void *)(pfn_part | idx_part);
+}
+
+/*
+ * Decode the <page, obj_idx> pair from the given object handle. The index
+ * bits hold obj_idx + 1 (see obj_location_to_handle()), so one is
+ * subtracted to recover the original value.
+ */
+static void obj_handle_to_location(unsigned long handle, struct page **page,
+				unsigned long *obj_idx)
+{
+	unsigned long pfn = handle >> OBJ_INDEX_BITS;
+	unsigned long biased_idx = handle & OBJ_INDEX_MASK;
+
+	*page = pfn_to_page(pfn);
+	*obj_idx = biased_idx - 1;
+}
+
+/*
+ * Byte offset within @page of object number @obj_idx. Objects on a
+ * non-first page are counted from ->index; on the first page the base
+ * is 0.
+ */
+static unsigned long obj_idx_to_offset(struct page *page,
+				unsigned long obj_idx, int class_size)
+{
+	unsigned long base = is_first_page(page) ? 0 : page->index;
+
+	return base + obj_idx * class_size;
+}
+
+/*
+ * Clear the struct page flags and fields that zsmalloc used on this page
+ * (first/last markers, ->private, ->mapping, ->freelist, mapcount).
+ */
+static void reset_page(struct page *page)
+{
+ clear_bit(PG_private, &page->flags);
+ clear_bit(PG_private_2, &page->flags);
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ page->freelist = NULL;
+ page_mapcount_reset(page);
+}
+
+/*
+ * Free every component page of the zspage headed by @first_page.
+ * The zspage must have no objects in use.
+ */
+static void free_zspage(struct page *first_page)
+{
+ struct page *nextp, *tmp, *head_extra;
+
+ BUG_ON(!is_first_page(first_page));
+ BUG_ON(first_page->inuse);
+
+ /* Fetch the second page before reset_page() clears ->private */
+ head_extra = (struct page *)page_private(first_page);
+
+ reset_page(first_page);
+ __free_page(first_page);
+
+ /* zspage with only 1 system page */
+ if (!head_extra)
+ return;
+
+ /* Free the pages chained to head_extra via ->lru, then head_extra */
+ list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
+ list_del(&nextp->lru);
+ reset_page(nextp);
+ __free_page(nextp);
+ }
+ reset_page(head_extra);
+ __free_page(head_extra);
+}
+
+/*
+ * Initialize a newly allocated zspage: walk its component pages and write
+ * a struct link_free into each object slot so that the free objects form
+ * a singly linked list running across the pages.
+ */
+static void init_zspage(struct page *first_page, struct size_class *class)
+{
+ unsigned long off = 0;
+ struct page *page = first_page;
+
+ BUG_ON(!is_first_page(first_page));
+ while (page) {
+ struct page *next_page;
+ struct link_free *link;
+ unsigned int i, objs_on_page;
+
+ /*
+ * page->index stores offset of first object starting
+ * in the page. For the first page, this is always 0,
+ * so we use first_page->index (aka ->freelist) to store
+ * head of corresponding zspage's freelist.
+ */
+ if (page != first_page)
+ page->index = off;
+
+ link = (struct link_free *)kmap_atomic(page) +
+ off / sizeof(*link);
+ /* Number of whole objects that fit from off to the page end */
+ objs_on_page = (PAGE_SIZE - off) / class->size;
+
+ /* Point each object at the next object index on this page */
+ for (i = 1; i <= objs_on_page; i++) {
+ off += class->size;
+ if (off < PAGE_SIZE) {
+ link->next = obj_location_to_handle(page, i);
+ link += class->size / sizeof(*link);
+ }
+ }
+
+ /*
+ * We now come to the last (full or partial) object on this
+ * page, which must point to the first object on the next
+ * page (if present)
+ */
+ next_page = get_next_page(page);
+ link->next = obj_location_to_handle(next_page, 0);
+ kunmap_atomic(link);
+ page = next_page;
+ /* Offset at which the first object on the next page starts */
+ off = (off + class->size) % PAGE_SIZE;
+ }
+}
+
+/*
+ * Allocate a zspage for the given size class
+ *
+ * Allocates class->pages_per_zspage single pages with @flags, links them
+ * together and initializes the free list. Returns the first page of the
+ * new zspage, or NULL if a page allocation failed (pages allocated so far
+ * are freed again).
+ */
+static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+{
+ int i, error;
+ struct page *first_page = NULL, *uninitialized_var(prev_page);
+
+ /*
+ * Allocate individual pages and link them together as:
+ * 1. first page->private = first sub-page
+ * 2. all sub-pages are linked together using page->lru
+ * 3. each sub-page is linked to the first page using page->first_page
+ *
+ * For each size class, First/Head pages are linked together using
+ * page->lru. Also, we set PG_private to identify the first page
+ * (i.e. no other sub-page has this flag set) and PG_private_2 to
+ * identify the last page.
+ */
+ error = -ENOMEM;
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
+
+ page = alloc_page(flags);
+ if (!page)
+ goto cleanup;
+
+ INIT_LIST_HEAD(&page->lru);
+ if (i == 0) { /* first page */
+ SetPagePrivate(page);
+ set_page_private(page, 0);
+ first_page = page;
+ first_page->inuse = 0;
+ }
+ if (i == 1)
+ set_page_private(first_page, (unsigned long)page);
+ if (i >= 1)
+ page->first_page = first_page;
+ if (i >= 2)
+ list_add(&page->lru, &prev_page->lru);
+ if (i == class->pages_per_zspage - 1) /* last page */
+ SetPagePrivate2(page);
+ prev_page = page;
+ }
+
+ init_zspage(first_page, class);
+
+ /* The free list starts at object 0 of the first page */
+ first_page->freelist = obj_location_to_handle(first_page, 0);
+ /* Maximum number of objects we can store in this zspage */
+ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+
+ error = 0; /* Success */
+
+cleanup:
+ /* On failure, release whatever part of the zspage was built */
+ if (unlikely(error) && first_page) {
+ free_zspage(first_page);
+ first_page = NULL;
+ }
+
+ return first_page;
+}
+
+/*
+ * Return the head zspage of the first non-empty fullness list of @class,
+ * scanning the lists in enum order, or NULL if every list is empty.
+ */
+static struct page *find_get_zspage(struct size_class *class)
+{
+	int fg;
+	struct page *head = NULL;
+
+	for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+		head = class->fullness_list[fg];
+		if (head)
+			return head;
+	}
+
+	return head;
+}
+
+#ifdef CONFIG_PGTABLE_MAPPING
+/*
+ * Set up the vm area of a per-cpu mapping area. An area that already has
+ * one is left untouched, so that a cpu UP notification racing with
+ * zs_init() on the same cpu does not leak memory.
+ * Returns 0 on success, -ENOMEM if the vm area cannot be allocated.
+ */
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+	if (!area->vm) {
+		area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+		if (!area->vm)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Release the per-cpu vm area, if any, and mark the area as unset */
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm)
+ free_vm_area(area->vm);
+ area->vm = NULL;
+}
+
+/*
+ * Map @pages into the per-cpu vm area and return the address @off bytes
+ * into that area. @size is unused in this variant.
+ * NOTE(review): presumably the two pages end up virtually contiguous so
+ * the object can be accessed linearly -- confirm against map_vm_area().
+ */
+static inline void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
+ area->vm_addr = area->vm->addr;
+ return area->vm_addr + off;
+}
+
+/*
+ * Undo __zs_map_object(): unmap the two-page range starting at the address
+ * saved in area->vm_addr. @pages, @off and @size are unused in this variant.
+ */
+static inline void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ unsigned long addr = (unsigned long)area->vm_addr;
+
+ unmap_kernel_range(addr, PAGE_SIZE * 2);
+}
+
+#else /* CONFIG_PGTABLE_MAPPING */
+
+/*
+ * Set up the one-page copy buffer of a per-cpu mapping area. An area that
+ * already has one is left untouched, so that a cpu UP notification racing
+ * with zs_init() on the same cpu does not leak memory.
+ * Returns 0 on success, -ENOMEM if the buffer page cannot be allocated.
+ */
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+	if (!area->vm_buf) {
+		area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+		if (!area->vm_buf)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Free the per-cpu copy buffer, if any, and mark the area as unset */
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm_buf)
+ free_page((unsigned long)area->vm_buf);
+ area->vm_buf = NULL;
+}
+
+/*
+ * Copy-based mapping of an object that spans two pages: the object's bytes
+ * are gathered from pages[0] (from @off to the end of the page) and from
+ * the start of pages[1] into the per-cpu buffer, whose address is returned.
+ * The copy is skipped when the mapping mode is ZS_MM_WO.
+ * Page faults are left disabled on return; __zs_unmap_object() re-enables
+ * them.
+ */
+static void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* disable page faults to match kmap_atomic() return conditions */
+ pagefault_disable();
+
+ /* no read fastpath */
+ if (area->vm_mm == ZS_MM_WO)
+ goto out;
+
+ /* sizes[0]: bytes in the first page, sizes[1]: bytes in the second */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy object to per-cpu buffer */
+ addr = kmap_atomic(pages[0]);
+ memcpy(buf, addr + off, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(buf + sizes[0], addr, sizes[1]);
+ kunmap_atomic(addr);
+out:
+ return area->vm_buf;
+}
+
+/*
+ * Counterpart of the copy-based __zs_map_object(): write the per-cpu buffer
+ * back into the two pages the object spans, then re-enable page faults.
+ * The write-back is skipped when the mapping mode is ZS_MM_RO.
+ */
+static void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* no write fastpath */
+ if (area->vm_mm == ZS_MM_RO)
+ goto out;
+
+ /* sizes[0]: bytes in the first page, sizes[1]: bytes in the second */
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy per-cpu buffer to object */
+ addr = kmap_atomic(pages[0]);
+ memcpy(addr + off, buf, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(addr, buf + sizes[0], sizes[1]);
+ kunmap_atomic(addr);
+
+out:
+ /* enable page faults to match kunmap_atomic() return conditions */
+ pagefault_enable();
+}
+
+#endif /* CONFIG_PGTABLE_MAPPING */
+
+/*
+ * CPU notifier callback: sets up the cpu's mapping area on CPU_UP_PREPARE
+ * and tears it down on CPU_DEAD / CPU_UP_CANCELED. Other actions are
+ * ignored. @pcpu carries the cpu number. Returns NOTIFY_OK, or the setup
+ * error converted with notifier_from_errno().
+ */
+static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
+ void *pcpu)
+{
+ int ret, cpu = (long)pcpu;
+ struct mapping_area *area;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ area = &per_cpu(zs_map_area, cpu);
+ ret = __zs_cpu_up(area);
+ if (ret)
+ return notifier_from_errno(ret);
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block zs_cpu_nb = {
+ .notifier_call = zs_cpu_notifier
+};
+
+/*
+ * Tear down the mapping area of every online cpu and unregister the cpu
+ * notifier. Also used by zs_init() to unwind after a failure.
+ */
+static void zs_exit(void)
+{
+ int cpu;
+
+ cpu_notifier_register_begin();
+
+ /* Reuse the notifier's CPU_DEAD path to free each cpu's area */
+ for_each_online_cpu(cpu)
+ zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
+ __unregister_cpu_notifier(&zs_cpu_nb);
+
+ cpu_notifier_register_done();
+}
+
+/*
+ * Register the cpu notifier and set up the mapping area of every online
+ * cpu. Returns 0 on success; on failure everything is undone through
+ * zs_exit() and the errno carried by the notifier result is returned.
+ */
+static int zs_init(void)
+{
+ int cpu, ret;
+
+ cpu_notifier_register_begin();
+
+ __register_cpu_notifier(&zs_cpu_nb);
+ /* Reuse the notifier's CPU_UP_PREPARE path for cpus already online */
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret)) {
+ cpu_notifier_register_done();
+ goto fail;
+ }
+ }
+
+ cpu_notifier_register_done();
+
+ return 0;
+fail:
+ zs_exit();
+ return notifier_to_errno(ret);
+}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags stored in the pool and used later when the pool
+ * grows (see zs_malloc). The pool metadata itself is allocated with
+ * GFP_KERNEL.
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(gfp_t flags)
+{
+ int i, ovhd_size;
+ struct zs_pool *pool;
+
+ /* The pool structure is rounded up to a whole number of pages */
+ ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+ pool = kzalloc(ovhd_size, GFP_KERNEL);
+ if (!pool)
+ return NULL;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int size;
+ struct size_class *class;
+
+ /* Class sizes grow by ZS_SIZE_CLASS_DELTA, capped at the maximum */
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+
+ class = &pool->size_class[i];
+ class->size = size;
+ class->index = i;
+ spin_lock_init(&class->lock);
+ class->pages_per_zspage = get_pages_per_zspage(size);
+
+ }
+
+ pool->flags = flags;
+
+ return pool;
+}
+EXPORT_SYMBOL_GPL(zs_create_pool);
+
+/*
+ * Free the pool metadata. Any fullness list that still holds zspages is
+ * only reported with pr_info(); this function does not free those zspages.
+ */
+void zs_destroy_pool(struct zs_pool *pool)
+{
+ int i;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int fg;
+ struct size_class *class = &pool->size_class[i];
+
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ }
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+/**
+ * zs_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ *
+ * On success, handle to the allocated object is returned,
+ * otherwise 0.
+ * Allocation requests with size 0 or size > ZS_MAX_ALLOC_SIZE will fail.
+ */
+unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+{
+ unsigned long obj;
+ struct link_free *link;
+ int class_idx;
+ struct size_class *class;
+
+ struct page *first_page, *m_page;
+ unsigned long m_objidx, m_offset;
+
+ if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+ return 0;
+
+ class_idx = get_size_class_index(size);
+ class = &pool->size_class[class_idx];
+ BUG_ON(class_idx != class->index);
+
+ spin_lock(&class->lock);
+ first_page = find_get_zspage(class);
+
+ if (!first_page) {
+ /* No usable zspage: allocate one with the class lock dropped */
+ spin_unlock(&class->lock);
+ first_page = alloc_zspage(class, pool->flags);
+ if (unlikely(!first_page))
+ return 0;
+
+ /* Starts as ZS_EMPTY; fix_fullness_group() below refiles it */
+ set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+ spin_lock(&class->lock);
+ class->pages_allocated += class->pages_per_zspage;
+ }
+
+ /* Take the object at the head of the zspage's free list */
+ obj = (unsigned long)first_page->freelist;
+ obj_handle_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ /* Advance the free list to the object this one linked to */
+ link = (struct link_free *)kmap_atomic(m_page) +
+ m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ memset(link, POISON_INUSE, sizeof(*link));
+ kunmap_atomic(link);
+
+ first_page->inuse++;
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(pool, first_page);
+ spin_unlock(&class->lock);
+
+ return obj;
+}
+EXPORT_SYMBOL_GPL(zs_malloc);
+
+/*
+ * zs_free - return an object to its zspage.
+ * @pool: pool the object was allocated from
+ * @obj: handle returned by zs_malloc(); 0 is ignored
+ *
+ * The object is pushed onto the head of its zspage's free list. If the
+ * zspage ends up in the ZS_EMPTY fullness group, the class page count is
+ * reduced and the zspage is freed after the class lock is dropped.
+ */
+void zs_free(struct zs_pool *pool, unsigned long obj)
+{
+ struct link_free *link;
+ struct page *first_page, *f_page;
+ unsigned long f_objidx, f_offset;
+
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!obj))
+ return;
+
+ /* decode the handle and look up the owning size class (done unlocked) */
+ obj_handle_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = &pool->size_class[class_idx];
+ f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+
+ spin_lock(&class->lock);
+
+ /* Insert this object in containing zspage's freelist */
+ link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
+ + f_offset);
+ link->next = first_page->freelist;
+ kunmap_atomic(link);
+ first_page->freelist = (void *)obj;
+
+ first_page->inuse--;
+ fullness = fix_fullness_group(pool, first_page);
+
+ if (fullness == ZS_EMPTY)
+ class->pages_allocated -= class->pages_per_zspage;
+
+ spin_unlock(&class->lock);
+
+ /* the zspage itself is released outside the class lock */
+ if (fullness == ZS_EMPTY)
+ free_zspage(first_page);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+/**
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
+ * @mm: mapping mode, recorded in the per-cpu mapping area
+ *
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
+ *
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
+ */
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
+{
+ struct page *page;
+ unsigned long obj_idx, off;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+
+ BUG_ON(!handle);
+
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another users mappings.
+ */
+ BUG_ON(in_interrupt());
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ /* the matching put_cpu_var() is in zs_unmap_object() */
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ return area->vm_addr + off;
+ }
+
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ return __zs_map_object(area, pages, off, class->size);
+}
+EXPORT_SYMBOL_GPL(zs_map_object);
+
+/*
+ * zs_unmap_object - undo a mapping set up by zs_map_object().
+ * @pool: pool from which the object was allocated
+ * @handle: handle that was passed to zs_map_object()
+ *
+ * Recomputes the object's location from the handle, tears down either the
+ * single-page atomic mapping or the two-page mapping, and releases the
+ * per-cpu mapping area.
+ */
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+{
+	struct mapping_area *area;
+	struct size_class *class;
+	struct page *page;
+	unsigned long obj_idx, off;
+	unsigned int class_idx;
+	enum fullness_group fg;
+
+	BUG_ON(!handle);
+
+	obj_handle_to_location(handle, &page, &obj_idx);
+	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+	class = &pool->size_class[class_idx];
+	off = obj_idx_to_offset(page, obj_idx, class->size);
+
+	area = this_cpu_ptr(&zs_map_area);
+	if (off + class->size > PAGE_SIZE) {
+		/* the object straddles a page boundary */
+		struct page *pages[2];
+
+		pages[0] = page;
+		pages[1] = get_next_page(page);
+		BUG_ON(!pages[1]);
+
+		__zs_unmap_object(area, pages, off, class->size);
+	} else {
+		/* the object lives entirely inside one page */
+		kunmap_atomic(area->vm_addr);
+	}
+	put_cpu_var(zs_map_area);
+}
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+/*
+ * zs_get_total_size_bytes - memory currently held by a pool, in bytes.
+ *
+ * Adds up pages_allocated over every size class (without taking the class
+ * locks) and converts the page count to bytes.
+ */
+u64 zs_get_total_size_bytes(struct zs_pool *pool)
+{
+	u64 total_pages = 0;
+	int idx = 0;
+
+	while (idx < ZS_SIZE_CLASSES) {
+		total_pages += pool->size_class[idx].pages_allocated;
+		idx++;
+	}
+
+	return total_pages << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
+
+module_init(zs_init);
+module_exit(zs_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 00000000000..008388fe7b0
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,940 @@
+/*
+ * zswap.c - zswap driver file
+ *
+ * zswap is a backend for frontswap that takes pages that are in the process
+ * of being swapped out and attempts to compress and store them in a
+ * RAM-based memory pool. This can result in a significant I/O reduction on
+ * the swap device and, in the case where decompressing from RAM is faster
+ * than reading from the swap device, can also improve workload performance.
+ *
+ * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/frontswap.h>
+#include <linux/rbtree.h>
+#include <linux/swap.h>
+#include <linux/crypto.h>
+#include <linux/mempool.h>
+#include <linux/zbud.h>
+
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
+#include <linux/swapops.h>
+#include <linux/writeback.h>
+#include <linux/pagemap.h>
+
+/*********************************
+* statistics
+**********************************/
+/* Number of memory pages used by the compressed pool */
+static u64 zswap_pool_pages;
+/* The number of compressed pages currently stored in zswap */
+static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+
+/*
+ * The statistics below are not protected from concurrent access for
+ * performance reasons, so they may not be 100% accurate. However,
+ * they do provide useful information on roughly how many times a
+ * certain event is occurring.
+*/
+
+/* Pool limit was hit (see zswap_max_pool_percent) */
+static u64 zswap_pool_limit_hit;
+/* Pages written back when pool limit was reached */
+static u64 zswap_written_back_pages;
+/* Store failed due to a reclaim failure after pool limit was reached */
+static u64 zswap_reject_reclaim_fail;
+/* Compressed page was too big for the allocator to (optimally) store */
+static u64 zswap_reject_compress_poor;
+/* Store failed because underlying allocator could not get memory */
+static u64 zswap_reject_alloc_fail;
+/* Store failed because the entry metadata could not be allocated (rare) */
+static u64 zswap_reject_kmemcache_fail;
+/* Duplicate store was encountered (rare) */
+static u64 zswap_duplicate_entry;
+
+/*********************************
+* tunables
+**********************************/
+/*
+ * Enable/disable zswap (disabled by default, fixed at boot for now).
+ * Exposed as the read-only (0444) module parameter "enabled".
+ */
+static bool zswap_enabled __read_mostly;
+module_param_named(enabled, zswap_enabled, bool, 0444);
+
+/*
+ * Compressor to be used by zswap (fixed at boot for now).
+ * Exposed as the read-only (0444) module parameter "compressor".
+ */
+#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
+static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+module_param_named(compressor, zswap_compressor, charp, 0444);
+
+/*
+ * The maximum percentage of memory that the compressed pool can occupy.
+ * Exposed as the module parameter "max_pool_percent" with mode 0644.
+ */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent,
+ zswap_max_pool_percent, uint, 0644);
+
+/* zbud_pool is shared by all of zswap backend */
+static struct zbud_pool *zswap_pool;
+
+/*********************************
+* compression functions
+**********************************/
+/* per-cpu compression transforms; each slot is filled by the cpu notifier */
+static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
+
+/* direction selector for zswap_comp_op() */
+enum comp_op {
+ ZSWAP_COMPOP_COMPRESS,
+ ZSWAP_COMPOP_DECOMPRESS
+};
+
+/*
+ * Run one compression or decompression on the current cpu's transform.
+ *
+ * @op selects the direction; @src/@slen describe the input, @dst/@dlen the
+ * output buffer and its length. The cpu is pinned with get_cpu() for the
+ * duration of the operation. Returns the crypto layer's result, or
+ * -EINVAL for an unknown @op.
+ */
+static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
+				u8 *dst, unsigned int *dlen)
+{
+	struct crypto_comp *tfm;
+	int err = -EINVAL;
+
+	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
+
+	if (op == ZSWAP_COMPOP_COMPRESS)
+		err = crypto_comp_compress(tfm, src, slen, dst, dlen);
+	else if (op == ZSWAP_COMPOP_DECOMPRESS)
+		err = crypto_comp_decompress(tfm, src, slen, dst, dlen);
+
+	put_cpu();
+	return err;
+}
+
+/*
+ * Pick the compressor and allocate the per-cpu transform pointers.
+ *
+ * If the configured compressor is not available, fall back to
+ * ZSWAP_COMPRESSOR_DEFAULT. Returns -ENODEV when no usable compressor
+ * exists, -ENOMEM when the per-cpu allocation fails, 0 otherwise.
+ */
+static int __init zswap_comp_init(void)
+{
+	int available = crypto_has_comp(zswap_compressor, 0, 0);
+
+	if (!available) {
+		pr_info("%s compressor not available\n", zswap_compressor);
+		/* fall back to default compressor */
+		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
+		available = crypto_has_comp(zswap_compressor, 0, 0);
+	}
+	if (!available)
+		/* can't even load the default compressor */
+		return -ENODEV;
+
+	pr_info("using %s compressor\n", zswap_compressor);
+
+	/* alloc percpu transforms */
+	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
+
+	return zswap_comp_pcpu_tfms ? 0 : -ENOMEM;
+}
+
+/*
+ * Release the per-cpu transform pointer array allocated by
+ * zswap_comp_init(). free_percpu() ignores a NULL pointer, so no guard
+ * is needed when the allocation never happened.
+ */
+static void zswap_comp_exit(void)
+{
+	/* free percpu transforms */
+	free_percpu(zswap_comp_pcpu_tfms);
+}
+
+/*********************************
+* data structures
+**********************************/
+/*
+ * struct zswap_entry
+ *
+ * This structure contains the metadata for tracking a single compressed
+ * page within zswap.
+ *
+ * rbnode - links the entry into red-black tree for the appropriate swap type
+ * refcount - the number of outstanding references to the entry. This is
+ *            needed to protect against premature freeing of the entry by
+ *            concurrent calls to load, invalidate, and writeback. The lock
+ *            for the zswap_tree structure that contains the entry must
+ *            be held while changing the refcount. Since the lock must
+ *            be held, there is no reason to also make refcount atomic.
+ * offset - the swap offset for the entry. Index into the red-black tree.
+ * handle - zbud allocation handle that stores the compressed page data
+ * length - the length in bytes of the compressed page data. Needed during
+ *          decompression
+ */
+struct zswap_entry {
+ struct rb_node rbnode;
+ pgoff_t offset;
+ int refcount;
+ unsigned int length;
+ unsigned long handle;
+};
+
+/*
+ * Header written at the start of every zbud allocation made by zswap; the
+ * compressed data follows it. Writeback reads swpentry back from the
+ * allocation to find the owning tree and offset.
+ */
+struct zswap_header {
+ swp_entry_t swpentry;
+};
+
+/*
+ * The tree lock in the zswap_tree struct protects a few things:
+ * - the rbtree
+ * - the refcount field of each entry in the tree
+ */
+struct zswap_tree {
+ struct rb_root rbroot;
+ spinlock_t lock;
+};
+
+/* one tree per swap type; a slot is NULL until zswap_frontswap_init() fills it */
+static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+
+/*********************************
+* zswap entry functions
+**********************************/
+/* slab cache backing every struct zswap_entry */
+static struct kmem_cache *zswap_entry_cache;
+
+/* Create the entry cache. Returns 0 on success, 1 if creation failed. */
+static int zswap_entry_cache_create(void)
+{
+	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
+	if (!zswap_entry_cache)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Destroy the entry cache. The name is misspelled ("destory") and is kept
+ * as is because init_zswap() calls it by this name.
+ */
+static void zswap_entry_cache_destory(void)
+{
+ kmem_cache_destroy(zswap_entry_cache);
+}
+
+/*
+ * Allocate a new entry from the entry cache using @gfp.
+ *
+ * The entry starts with a refcount of 1 and an rb node marked as not
+ * linked into any tree. Returns NULL if the allocation fails.
+ */
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+{
+	struct zswap_entry *fresh = kmem_cache_alloc(zswap_entry_cache, gfp);
+
+	if (fresh) {
+		fresh->refcount = 1;
+		RB_CLEAR_NODE(&fresh->rbnode);
+	}
+	return fresh;
+}
+
+/* Return @entry to the entry cache; its zbud allocation is not touched here. */
+static void zswap_entry_cache_free(struct zswap_entry *entry)
+{
+ kmem_cache_free(zswap_entry_cache, entry);
+}
+
+/*********************************
+* rbtree functions
+**********************************/
+/*
+ * Look up the entry stored under @offset in the tree rooted at @root.
+ * Returns the entry, or NULL if the offset is not present. No reference
+ * is taken on the result.
+ */
+static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
+{
+	struct rb_node *cursor;
+
+	for (cursor = root->rb_node; cursor; ) {
+		struct zswap_entry *candidate;
+
+		candidate = rb_entry(cursor, struct zswap_entry, rbnode);
+		if (candidate->offset == offset)
+			return candidate;
+
+		/* smaller offsets live to the left, larger ones to the right */
+		if (candidate->offset > offset)
+			cursor = cursor->rb_left;
+		else
+			cursor = cursor->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @entry into the tree rooted at @root, keyed by its offset.
+ *
+ * If an entry with the same offset already exists, nothing is inserted:
+ * a pointer to the existing entry is stored in @dupentry and -EEXIST is
+ * returned. Returns 0 after a successful insertion.
+ */
+static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
+			struct zswap_entry **dupentry)
+{
+	struct rb_node **slot = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*slot) {
+		struct zswap_entry *existing;
+
+		parent = *slot;
+		existing = rb_entry(parent, struct zswap_entry, rbnode);
+
+		if (existing->offset == entry->offset) {
+			*dupentry = existing;
+			return -EEXIST;
+		}
+
+		if (existing->offset > entry->offset)
+			slot = &parent->rb_left;
+		else
+			slot = &parent->rb_right;
+	}
+
+	rb_link_node(&entry->rbnode, parent, slot);
+	rb_insert_color(&entry->rbnode, root);
+	return 0;
+}
+
+/*
+ * Unlink @entry from the tree rooted at @root. Does nothing if the entry's
+ * node is already marked as not linked, so calling it twice is harmless.
+ */
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+	if (RB_EMPTY_NODE(&entry->rbnode))
+		return;
+
+	rb_erase(&entry->rbnode, root);
+	RB_CLEAR_NODE(&entry->rbnode);
+}
+
+/*
+ * Carries out the common pattern of freeing an entry's zbud allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ * Also refreshes zswap_pool_pages from the zbud pool size.
+ */
+static void zswap_free_entry(struct zswap_entry *entry)
+{
+ zbud_free(zswap_pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_pool_pages = zbud_get_pool_size(zswap_pool);
+}
+
+/*
+ * Take an additional reference on @entry.
+ * caller must hold the tree lock
+ */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+ entry->refcount++;
+}
+
+/*
+ * Drop one reference on @entry. Caller must hold the tree lock.
+ *
+ * When the last reference goes away the entry is removed from @tree (if it
+ * is still linked) and freed. A negative refcount is a bug.
+ */
+static void zswap_entry_put(struct zswap_tree *tree,
+			struct zswap_entry *entry)
+{
+	entry->refcount--;
+
+	BUG_ON(entry->refcount < 0);
+	if (entry->refcount)
+		return;
+
+	/* nobody references the entry any more */
+	zswap_rb_erase(&tree->rbroot, entry);
+	zswap_free_entry(entry);
+}
+
+/*
+ * Look up @offset in @root and take a reference on the entry if found.
+ * Caller must hold the tree lock. Returns the entry or NULL.
+ */
+static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+				pgoff_t offset)
+{
+	struct zswap_entry *found = zswap_rb_search(root, offset);
+
+	if (!found)
+		return NULL;
+
+	zswap_entry_get(found);
+	return found;
+}
+
+/*********************************
+* per-cpu code
+**********************************/
+/* per-cpu scratch buffer (2 * PAGE_SIZE) that receives compressor output */
+static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+
+/*
+ * Set up or tear down the per-cpu compression state for @cpu.
+ *
+ * CPU_UP_PREPARE allocates the compressor transform and the destination
+ * buffer; CPU_DEAD and CPU_UP_CANCELED release both. Any other action is
+ * ignored. Returns NOTIFY_BAD if an allocation fails, NOTIFY_OK otherwise.
+ */
+static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+{
+ struct crypto_comp *tfm;
+ u8 *dst;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_err("can't allocate compressor transform\n");
+ return NOTIFY_BAD;
+ }
+ *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
+ dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
+ if (!dst) {
+ pr_err("can't allocate compressor buffer\n");
+ /* undo the transform so no half-initialized state is left */
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
+ return NOTIFY_BAD;
+ }
+ per_cpu(zswap_dstmem, cpu) = dst;
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
+ if (tfm) {
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
+ }
+ dst = per_cpu(zswap_dstmem, cpu);
+ kfree(dst);
+ per_cpu(zswap_dstmem, cpu) = NULL;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Notifier callback: decode the cpu number carried in @pcpu and hand the
+ * event to __zswap_cpu_notifier(). @nb is unused.
+ */
+static int zswap_cpu_notifier(struct notifier_block *nb,
+				unsigned long action, void *pcpu)
+{
+	return __zswap_cpu_notifier(action, (unsigned long)pcpu);
+}
+
+/* notifier block registered by zswap_cpu_init() */
+static struct notifier_block zswap_cpu_notifier_block = {
+ .notifier_call = zswap_cpu_notifier
+};
+
+/*
+ * Prepare the per-cpu compression state for every online cpu, then
+ * register the cpu notifier.
+ *
+ * If preparation fails for any cpu, the CPU_UP_CANCELED path is run for
+ * every online cpu and -ENOMEM is returned. Returns 0 on success.
+ */
+static int zswap_cpu_init(void)
+{
+ unsigned long cpu;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(cpu)
+ if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+ goto cleanup;
+ __register_cpu_notifier(&zswap_cpu_notifier_block);
+ cpu_notifier_register_done();
+ return 0;
+
+cleanup:
+ /* also visits cpus that were never prepared; their slots are still NULL */
+ for_each_online_cpu(cpu)
+ __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+ cpu_notifier_register_done();
+ return -ENOMEM;
+}
+
+/*********************************
+* helpers
+**********************************/
+/*
+ * True when the compressed pool occupies more pages than
+ * zswap_max_pool_percent percent of totalram_pages.
+ */
+static bool zswap_is_full(void)
+{
+	return zswap_pool_pages >
+		totalram_pages * zswap_max_pool_percent / 100;
+}
+
+/*********************************
+* writeback code
+**********************************/
+/* return enum for zswap_get_swap_cache_page */
+enum zswap_get_swap_ret {
+ ZSWAP_SWAPCACHE_NEW, /* a new, locked page was added to the swap cache */
+ ZSWAP_SWAPCACHE_EXIST, /* the page was already in the swap cache */
+ ZSWAP_SWAPCACHE_FAIL, /* no page could be obtained */
+};
+
+/*
+ * zswap_get_swap_cache_page
+ *
+ * This is an adaptation of read_swap_cache_async()
+ *
+ * This function tries to find a page with the given swap entry
+ * in the swapper_space address space (the swap cache). If the page
+ * is found, it is returned in retpage. Otherwise, a page is allocated,
+ * added to the swap cache, and returned in retpage.
+ *
+ * If success, the swap cache page is returned in retpage
+ * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
+ * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
+ * the new page is added to swapcache and locked
+ * Returns ZSWAP_SWAPCACHE_FAIL on error; retpage is then left NULL
+ */
+static int zswap_get_swap_cache_page(swp_entry_t entry,
+ struct page **retpage)
+{
+ struct page *found_page, *new_page = NULL;
+ struct address_space *swapper_space = swap_address_space(entry);
+ int err;
+
+ *retpage = NULL;
+ do {
+ /*
+ * First check the swap cache. Since this is normally
+ * called after lookup_swap_cache() failed, re-calling
+ * that would confuse statistics.
+ */
+ found_page = find_get_page(swapper_space, entry.val);
+ if (found_page)
+ break;
+
+ /*
+ * Get a new page to read into from swap.
+ * The page is kept across loop iterations and reused on retry.
+ */
+ if (!new_page) {
+ new_page = alloc_page(GFP_KERNEL);
+ if (!new_page)
+ break; /* Out of memory */
+ }
+
+ /*
+ * call radix_tree_preload() while we can wait.
+ */
+ err = radix_tree_preload(GFP_KERNEL);
+ if (err)
+ break;
+
+ /*
+ * Swap entry may have been freed since our caller observed it.
+ */
+ err = swapcache_prepare(entry);
+ if (err == -EEXIST) { /* seems racy */
+ radix_tree_preload_end();
+ continue;
+ }
+ if (err) { /* swp entry is obsolete ? */
+ radix_tree_preload_end();
+ break;
+ }
+
+ /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+ err = __add_to_swap_cache(new_page, entry);
+ if (likely(!err)) {
+ radix_tree_preload_end();
+ lru_cache_add_anon(new_page);
+ *retpage = new_page;
+ return ZSWAP_SWAPCACHE_NEW;
+ }
+ /* insertion failed: undo the page flags set above */
+ radix_tree_preload_end();
+ ClearPageSwapBacked(new_page);
+ __clear_page_locked(new_page);
+ /*
+ * add_to_swap_cache() doesn't return -EEXIST, so we can safely
+ * clear SWAP_HAS_CACHE flag.
+ */
+ swapcache_free(entry, NULL);
+ } while (err != -ENOMEM);
+
+ /* reached on lookup hit or on failure: the spare page is not needed */
+ if (new_page)
+ page_cache_release(new_page);
+ if (!found_page)
+ return ZSWAP_SWAPCACHE_FAIL;
+ *retpage = found_page;
+ return ZSWAP_SWAPCACHE_EXIST;
+}
+
+/*
+ * Attempts to free an entry by adding a page to the swap cache,
+ * decompressing the entry data into the page, and issuing a
+ * bio write to write the page back to the swap device.
+ *
+ * This can be thought of as a "resumed writeback" of the page
+ * to the swap device. We are basically resuming the same swap
+ * writeback path that was intercepted with the frontswap_store()
+ * in the first place. After the page has been decompressed into
+ * the swap cache, the compressed version stored by zswap can be
+ * freed.
+ *
+ * Installed as the zbud pool's evict callback.
+ *
+ * Returns 0 if writeback was started or the entry had already been
+ * invalidated, -ENOMEM if no swap cache page could be obtained, and
+ * -EEXIST if the page was already in the swap cache.
+ */
+static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
+{
+ struct zswap_header *zhdr;
+ swp_entry_t swpentry;
+ struct zswap_tree *tree;
+ pgoff_t offset;
+ struct zswap_entry *entry;
+ struct page *page;
+ u8 *src, *dst;
+ unsigned int dlen;
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ /* extract swpentry from data */
+ zhdr = zbud_map(pool, handle);
+ swpentry = zhdr->swpentry; /* copied out while the allocation is mapped */
+ zbud_unmap(pool, handle);
+ tree = zswap_trees[swp_type(swpentry)];
+ offset = swp_offset(swpentry);
+
+ /* find and ref zswap entry */
+ spin_lock(&tree->lock);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was invalidated */
+ spin_unlock(&tree->lock);
+ return 0;
+ }
+ spin_unlock(&tree->lock);
+ BUG_ON(offset != entry->offset);
+
+ /* try to allocate swap cache page */
+ switch (zswap_get_swap_cache_page(swpentry, &page)) {
+ case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ ret = -ENOMEM;
+ goto fail;
+
+ case ZSWAP_SWAPCACHE_EXIST:
+ /* page is already in the swap cache, ignore for now */
+ page_cache_release(page);
+ ret = -EEXIST;
+ goto fail;
+
+ case ZSWAP_SWAPCACHE_NEW: /* page is locked */
+ /* decompress */
+ dlen = PAGE_SIZE;
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
+ sizeof(struct zswap_header);
+ dst = kmap_atomic(page);
+ ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
+ entry->length, dst, &dlen);
+ kunmap_atomic(dst);
+ zbud_unmap(zswap_pool, entry->handle);
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);
+
+ /* page is up to date */
+ SetPageUptodate(page);
+ }
+
+ /* move it to the tail of the inactive list after end_writeback */
+ SetPageReclaim(page);
+
+ /* start writeback */
+ __swap_writepage(page, &wbc, end_swap_bio_write);
+ page_cache_release(page);
+ zswap_written_back_pages++;
+
+ spin_lock(&tree->lock);
+ /* drop local reference */
+ zswap_entry_put(tree, entry);
+
+ /*
+ * There are two possible situations for entry here:
+ * (1) refcount is 1 (normal case), entry is valid and on the tree
+ * (2) refcount is 0, entry is freed and not on the tree
+ * because invalidate happened during writeback
+ * Search the tree and, if the entry is still there, drop the
+ * remaining reference so that it is freed.
+ */
+ if (entry == zswap_rb_search(&tree->rbroot, offset))
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+ goto end;
+
+ /*
+ * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
+ * happening concurrently. It is safe and okay to not free the
+ * entry. If the following put does free the entry, it is also
+ * okay to return !0.
+ */
+fail:
+ spin_lock(&tree->lock);
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+end:
+ return ret;
+}
+
+/*********************************
+* frontswap hooks
+**********************************/
+/*
+ * Attempts to compress and store a single page.
+ *
+ * @type and @offset identify the swap slot, @page is the page being
+ * swapped out. If the pool is over its limit, one zbud page is reclaimed
+ * first. The page is compressed into the per-cpu buffer, copied into a
+ * zbud allocation behind a struct zswap_header, and the entry is inserted
+ * into the tree for @type, replacing any existing entry with the same
+ * offset.
+ *
+ * Returns 0 on success. Returns -ENODEV if there is no tree for @type,
+ * -ENOMEM if reclaim or the entry allocation fails, -EINVAL if
+ * compression fails, or the error returned by zbud_alloc().
+ */
+static int zswap_frontswap_store(unsigned type, pgoff_t offset,
+				struct page *page)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *entry, *dupentry;
+	int ret;
+	unsigned int dlen = PAGE_SIZE, len;
+	unsigned long handle;
+	/* buf is u8 * to match the (u8 *) cast it is assigned from */
+	u8 *src, *dst, *buf;
+	struct zswap_header *zhdr;
+
+	if (!tree) {
+		ret = -ENODEV;
+		goto reject;
+	}
+
+	/* reclaim space if needed */
+	if (zswap_is_full()) {
+		zswap_pool_limit_hit++;
+		if (zbud_reclaim_page(zswap_pool, 8)) {
+			zswap_reject_reclaim_fail++;
+			ret = -ENOMEM;
+			goto reject;
+		}
+	}
+
+	/* allocate entry */
+	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	if (!entry) {
+		zswap_reject_kmemcache_fail++;
+		ret = -ENOMEM;
+		goto reject;
+	}
+
+	/* compress */
+	dst = get_cpu_var(zswap_dstmem);
+	src = kmap_atomic(page);
+	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+	kunmap_atomic(src);
+	if (ret) {
+		ret = -EINVAL;
+		goto freepage;
+	}
+
+	/* store */
+	len = dlen + sizeof(struct zswap_header);
+	ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
+		&handle);
+	if (ret == -ENOSPC) {
+		zswap_reject_compress_poor++;
+		goto freepage;
+	}
+	if (ret) {
+		zswap_reject_alloc_fail++;
+		goto freepage;
+	}
+	/* header first, compressed data directly behind it */
+	zhdr = zbud_map(zswap_pool, handle);
+	zhdr->swpentry = swp_entry(type, offset);
+	buf = (u8 *)(zhdr + 1);
+	memcpy(buf, dst, dlen);
+	zbud_unmap(zswap_pool, handle);
+	put_cpu_var(zswap_dstmem);
+
+	/* populate entry */
+	entry->offset = offset;
+	entry->handle = handle;
+	entry->length = dlen;
+
+	/* map */
+	spin_lock(&tree->lock);
+	do {
+		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
+		if (ret == -EEXIST) {
+			zswap_duplicate_entry++;
+			/* remove from rbtree */
+			zswap_rb_erase(&tree->rbroot, dupentry);
+			zswap_entry_put(tree, dupentry);
+		}
+	} while (ret == -EEXIST);
+	spin_unlock(&tree->lock);
+
+	/* update stats */
+	atomic_inc(&zswap_stored_pages);
+	zswap_pool_pages = zbud_get_pool_size(zswap_pool);
+
+	return 0;
+
+freepage:
+	/* release the per-cpu buffer taken for compression, then the entry */
+	put_cpu_var(zswap_dstmem);
+	zswap_entry_cache_free(entry);
+reject:
+	return ret;
+}
+
+/*
+ * Decompress the page stored for (@type, @offset) into @page.
+ *
+ * returns 0 if the page was successfully decompressed
+ * return -1 on entry not found or error
+ *
+ * A reference is held on the entry while it is decompressed outside the
+ * tree lock, and dropped afterwards. A decompression failure is a BUG.
+*/
+static int zswap_frontswap_load(unsigned type, pgoff_t offset,
+ struct page *page)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry;
+ u8 *src, *dst;
+ unsigned int dlen;
+ int ret;
+
+ /* find */
+ spin_lock(&tree->lock);
+ entry = zswap_entry_find_get(&tree->rbroot, offset);
+ if (!entry) {
+ /* entry was written back */
+ spin_unlock(&tree->lock);
+ return -1;
+ }
+ spin_unlock(&tree->lock);
+
+ /* decompress */
+ dlen = PAGE_SIZE;
+ /* the compressed data sits right after the zswap_header */
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
+ sizeof(struct zswap_header);
+ dst = kmap_atomic(page);
+ ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
+ dst, &dlen);
+ kunmap_atomic(dst);
+ zbud_unmap(zswap_pool, entry->handle);
+ BUG_ON(ret);
+
+ /* drop the reference taken by zswap_entry_find_get() */
+ spin_lock(&tree->lock);
+ zswap_entry_put(tree, entry);
+ spin_unlock(&tree->lock);
+
+ return 0;
+}
+
+/*
+ * Invalidate the entry stored for (@type, @offset), if there is one.
+ *
+ * The entry is unlinked from the tree and the reference taken at creation
+ * is dropped; it is freed once no other user holds a reference. Nothing
+ * happens when the offset is not in the tree (e.g. it was written back).
+ */
+static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct zswap_tree *tree = zswap_trees[type];
+	struct zswap_entry *victim;
+
+	spin_lock(&tree->lock);
+
+	victim = zswap_rb_search(&tree->rbroot, offset);
+	if (victim) {
+		/* remove from rbtree */
+		zswap_rb_erase(&tree->rbroot, victim);
+
+		/* drop the initial reference from entry creation */
+		zswap_entry_put(tree, victim);
+	}
+
+	spin_unlock(&tree->lock);
+}
+
+/*
+ * frees all zswap entries for the given swap type, then frees the tree
+ * itself and clears its slot in zswap_trees. Does nothing if no tree
+ * exists for @type.
+ */
+static void zswap_frontswap_invalidate_area(unsigned type)
+{
+ struct zswap_tree *tree = zswap_trees[type];
+ struct zswap_entry *entry, *n;
+
+ if (!tree)
+ return;
+
+ /* walk the tree and free everything */
+ spin_lock(&tree->lock);
+ /* entries are freed directly, regardless of their refcount */
+ rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
+ zswap_free_entry(entry);
+ tree->rbroot = RB_ROOT;
+ spin_unlock(&tree->lock);
+ kfree(tree);
+ zswap_trees[type] = NULL;
+}
+
+/* zbud callbacks: eviction of a handle is done by writing the entry back */
+static struct zbud_ops zswap_zbud_ops = {
+ .evict = zswap_writeback_entry
+};
+
+/*
+ * Allocate and publish the (empty) tree for swap type @type.
+ *
+ * On allocation failure an error is logged and zswap_trees[type] is left
+ * untouched, so stores for that type are rejected.
+ */
+static void zswap_frontswap_init(unsigned type)
+{
+	struct zswap_tree *tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
+
+	if (tree) {
+		tree->rbroot = RB_ROOT;
+		spin_lock_init(&tree->lock);
+		zswap_trees[type] = tree;
+	} else {
+		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+	}
+}
+
+/* frontswap callbacks implemented by zswap; registered in init_zswap() */
+static struct frontswap_ops zswap_frontswap_ops = {
+ .store = zswap_frontswap_store,
+ .load = zswap_frontswap_load,
+ .invalidate_page = zswap_frontswap_invalidate_page,
+ .invalidate_area = zswap_frontswap_invalidate_area,
+ .init = zswap_frontswap_init
+};
+
+/*********************************
+* debugfs functions
+**********************************/
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static struct dentry *zswap_debugfs_root;
+
+/*
+ * Create the "zswap" debugfs directory and one read-only file per
+ * statistics counter.
+ *
+ * Returns -ENODEV if debugfs is not initialized, -ENOMEM if the directory
+ * cannot be created, 0 otherwise. Failures creating individual files are
+ * not checked.
+ */
+static int __init zswap_debugfs_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
+ if (!zswap_debugfs_root)
+ return -ENOMEM;
+
+ debugfs_create_u64("pool_limit_hit", S_IRUGO,
+ zswap_debugfs_root, &zswap_pool_limit_hit);
+ debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_reclaim_fail);
+ debugfs_create_u64("reject_alloc_fail", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_alloc_fail);
+ debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_kmemcache_fail);
+ debugfs_create_u64("reject_compress_poor", S_IRUGO,
+ zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("written_back_pages", S_IRUGO,
+ zswap_debugfs_root, &zswap_written_back_pages);
+ debugfs_create_u64("duplicate_entry", S_IRUGO,
+ zswap_debugfs_root, &zswap_duplicate_entry);
+ debugfs_create_u64("pool_pages", S_IRUGO,
+ zswap_debugfs_root, &zswap_pool_pages);
+ debugfs_create_atomic_t("stored_pages", S_IRUGO,
+ zswap_debugfs_root, &zswap_stored_pages);
+
+ return 0;
+}
+
+/*
+ * Remove the zswap debugfs directory and everything below it.
+ * NOTE(review): no caller is visible in this file - confirm it is used.
+ */
+static void __exit zswap_debugfs_exit(void)
+{
+ debugfs_remove_recursive(zswap_debugfs_root);
+}
+#else
+/* !CONFIG_DEBUG_FS: nothing to create, always reports success */
+static int __init zswap_debugfs_init(void)
+{
+ return 0;
+}
+
+/* !CONFIG_DEBUG_FS: nothing to remove */
+static void __exit zswap_debugfs_exit(void) { }
+#endif
+
+/*********************************
+* module init and exit
+**********************************/
+/*
+ * Bring up zswap: create the zbud pool, the entry cache, the compressor
+ * state and the per-cpu state, then register the frontswap ops.
+ *
+ * Returns 0 when zswap is disabled or fully initialized. Every failure
+ * path unwinds what was set up so far and returns -ENOMEM, including the
+ * case where zswap_comp_init() failed with -ENODEV. A debugfs failure is
+ * only logged.
+ */
+static int __init init_zswap(void)
+{
+ if (!zswap_enabled)
+ return 0;
+
+ pr_info("loading zswap\n");
+
+ zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+ if (!zswap_pool) {
+ pr_err("zbud pool creation failed\n");
+ goto error;
+ }
+
+ if (zswap_entry_cache_create()) {
+ pr_err("entry cache creation failed\n");
+ goto cachefail;
+ }
+ if (zswap_comp_init()) {
+ pr_err("compressor initialization failed\n");
+ goto compfail;
+ }
+ if (zswap_cpu_init()) {
+ pr_err("per-cpu initialization failed\n");
+ goto pcpufail;
+ }
+
+ frontswap_register_ops(&zswap_frontswap_ops);
+ if (zswap_debugfs_init())
+ pr_warn("debugfs initialization failed\n");
+ return 0;
+ /* unwind in reverse order of setup */
+pcpufail:
+ zswap_comp_exit();
+compfail:
+ zswap_entry_cache_destory();
+cachefail:
+ zbud_destroy_pool(zswap_pool);
+error:
+ return -ENOMEM;
+}
+/* must be late so crypto has time to come up */
+late_initcall(init_zswap);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Compressed cache for swap pages");