Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            17
-rw-r--r--  mm/bootmem.c           8
-rw-r--r--  mm/compaction.c        7
-rw-r--r--  mm/huge_memory.c     133
-rw-r--r--  mm/kmemleak.c          4
-rw-r--r--  mm/ksm.c               4
-rw-r--r--  mm/memblock.c        124
-rw-r--r--  mm/memcontrol.c      169
-rw-r--r--  mm/memory-failure.c   36
-rw-r--r--  mm/memory.c          143
-rw-r--r--  mm/memory_hotplug.c   65
-rw-r--r--  mm/mempolicy.c       144
-rw-r--r--  mm/migrate.c          30
-rw-r--r--  mm/mm_init.c          18
-rw-r--r--  mm/mmap.c             20
-rw-r--r--  mm/mmzone.c           14
-rw-r--r--  mm/mprotect.c         69
-rw-r--r--  mm/nobootmem.c        25
-rw-r--r--  mm/nommu.c             5
-rw-r--r--  mm/page_alloc.c       38
-rw-r--r--  mm/percpu.c            5
-rw-r--r--  mm/readahead.c         8
-rw-r--r--  mm/slab.c              2
-rw-r--r--  mm/slab.h              6
-rw-r--r--  mm/slab_common.c       2
-rw-r--r--  mm/slub.c              2
-rw-r--r--  mm/sparse.c           53
-rw-r--r--  mm/swapfile.c         16
-rw-r--r--  mm/util.c             13
-rw-r--r--  mm/vmalloc.c          48
-rw-r--r--  mm/vmstat.c           22
-rw-r--r--  mm/zswap.c           195
32 files changed, 834 insertions, 611 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 394838f489e..3f4ffda152b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,11 +153,18 @@ config MOVABLE_NODE
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows users to
- online all the memory of a node as movable memory so that the whole
- node can be hotplugged. Users who don't use the memory hotplug
- feature are fine with this option on since they don't online memory
- as movable.
+ memory device cannot be hotplugged. This option allows the following
+ two things:
+ - When the system is booting, a node full of hotpluggable memory can
+ be arranged to have only movable memory so that the whole node can
+ be hot-removed (requires the movable_node boot option).
+ - After the system is up, the option allows users to online all the
+ memory of a node as movable memory so that the whole node can be
+ hot-removed.
+
+ Users who don't use the memory hotplug feature are fine with this
+ option on, since they neither specify the movable_node boot option
+ nor online memory as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
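The help text above refers to the movable_node boot option. As a hedged illustration of how such a boot option is typically consumed (the flag name movable_node_enabled and the parser below are assumptions for this sketch, not lines quoted from the series), the parameter simply flips a flag that the memory hotplug and memblock code consult later:

#ifdef CONFIG_MOVABLE_NODE
/* sketch: remember that hotpluggable nodes should be kept fully movable */
bool movable_node_enabled = false;

static int __init cmdline_parse_movable_node(char *p)
{
	movable_node_enabled = true;
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
#endif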
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692..90bd3507b41 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long start, end, pages, count = 0;
+ unsigned long *map, start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
+ map = bdata->node_bootmem_map;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdata - bootmem_node_data, start, end);
while (start < end) {
- unsigned long *map, idx, vec;
+ unsigned long idx, vec;
unsigned shift;
- map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+ end_pfn = pgdat_end_pfn(pgdat);
if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
(goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/compaction.c b/mm/compaction.c
index b5326b141a2..805165bcd3d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
}
/*
- * Isolate free pages onto a private freelist. Caller must hold zone->lock.
- * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
- * pages inside of the pageblock (even though it may still end up isolating
- * some pages).
+ * Isolate free pages onto a private freelist. If @strict is true, will abort
+ * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
+ * (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long blockpfn,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cca80d96e50..0556c6a4495 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
#include "internal.h"
/*
- * By default transparent hugepage support is enabled for all mappings
- * and khugepaged scans all mappings. Defrag is only invoked by
- * khugepaged hugepage allocations and by page faults inside
- * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
- * allocations.
+ * By default, transparent hugepage support is disabled to avoid the risk of
+ * increasing the memory footprint of applications without a guaranteed
+ * benefit. When transparent hugepage support is enabled, it applies to all
+ * mappings, and khugepaged scans all mappings.
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults
+ * for all hugepage allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -758,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-#ifndef CONFIG_NUMA
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-#endif
-
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -1282,19 +1275,32 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
- int target_nid;
+ int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
+ int flags = 0;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
+ BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid)
+ if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pmd_write(pmd))
+ flags |= TNF_NO_GROUP;
/*
* Acquire the page lock to serialise THP migrations but avoid dropping
@@ -1325,7 +1331,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
anon_vma = page_lock_anon_vma_read(page);
- /* Confirm the PTE did not while locked */
+ /* Confirm the PMD did not change while page_table_lock was released */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
@@ -1341,8 +1347,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (migrated)
+ if (migrated) {
+ flags |= TNF_MIGRATED;
page_nid = target_nid;
+ }
goto out;
clear_pmdnuma:
@@ -1360,7 +1368,7 @@ out:
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
return 0;
}
@@ -1458,6 +1466,12 @@ out:
return ret;
}
+/*
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * - HPAGE_PMD_NR if protections changed and TLB flush necessary
+ */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
@@ -1466,22 +1480,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ ret = 1;
if (!prot_numa) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /* only check non-shared pages */
- if (page_mapcount(page) == 1 &&
+ /*
+ * Do not trap faults against the zero page. The
+ * read-only data is likely to be read-cached on the
+ * local CPU cache and it is less useful to know about
+ * local vs remote hits on the zero page.
+ */
+ if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_mknuma(entry);
+ ret = HPAGE_PMD_NR;
}
}
- set_pmd_at(mm, addr, pmd, entry);
+
+ /* Set PMD if cleared earlier */
+ if (ret == HPAGE_PMD_NR)
+ set_pmd_at(mm, addr, pmd, entry);
+
spin_unlock(&vma->vm_mm->page_table_lock);
- ret = 1;
}
return ret;
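The return-value contract documented above (0 when the PMD could not be locked, 1 when it was locked but left unchanged, HPAGE_PMD_NR when protections changed and a TLB flush is needed) is consumed by callers such as the mprotect path. A hedged caller sketch, using a hypothetical helper name rather than the real mm/mprotect.c code:

static unsigned long example_change_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, pgprot_t newprot, int prot_numa)
{
	int nr = change_huge_pmd(vma, pmd, addr, newprot, prot_numa);

	if (nr == HPAGE_PMD_NR)
		return HPAGE_PMD_NR;	/* protections changed, flush the TLB */
	if (nr == 1)
		return 0;		/* huge PMD locked but untouched, no flush */

	/* nr == 0: no stable huge PMD here, fall back to the PTE-level walk */
	return 0;
}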
@@ -1662,7 +1688,7 @@ static void __split_huge_page_refcount(struct page *page,
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_nid_xchg_last(page_tail, page_nid_last(page));
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -2165,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
+static int khugepaged_node_load[MAX_NUMNODES];
+
#ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+ static int last_khugepaged_target_node = NUMA_NO_NODE;
+ int nid, target_node = 0, max_value = 0;
+
+ /* find first node with max normal pages hit */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ if (khugepaged_node_load[nid] > max_value) {
+ max_value = khugepaged_node_load[nid];
+ target_node = nid;
+ }
+
+ /* do some balancing if several nodes have the same hit record */
+ if (target_node <= last_khugepaged_target_node)
+ for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == khugepaged_node_load[nid]) {
+ target_node = nid;
+ break;
+ }
+
+ last_khugepaged_target_node = target_node;
+ return target_node;
+}
+
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
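To make the balancing step in khugepaged_find_target_node() concrete (a worked example, not taken from the diff): suppose a scan leaves khugepaged_node_load as {node0: 2, node1: 5, node2: 5}. The first loop picks node1, the first node with the maximum count. If the previous collapse also targeted node1, the second loop walks the nodes after it, finds node2 with the same count, and selects it instead, so successive collapses alternate between equally loaded nodes rather than piling onto the first one.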
@@ -2199,9 +2252,8 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
- node, __GFP_OTHER_NODE);
-
+ *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+ khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2217,6 +2269,17 @@ static struct page
return *hpage;
}
#else
+static int khugepaged_find_target_node(void)
+{
+ return 0;
+}
+
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2423,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
+ memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2439,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Chose the node of the first page. This could
- * be more sophisticated and look at more pages,
- * but isn't for now.
+ * Record which node the original page is from and save this
+ * information to khugepaged_node_load[].
+ * Khugepaged will allocate a hugepage from the node with the max
+ * hit record.
*/
- if (node == NUMA_NO_NODE)
- node = page_to_nid(page);
+ node = page_to_nid(page);
+ khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2459,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
+ node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
+ }
out:
return ret;
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad..31f01c5011e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
- if (ptr + size > object->pointer + object->size) {
+ if (size == SIZE_MAX) {
+ size = object->pointer + object->size - ptr;
+ } else if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
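With the hunk above, a size of SIZE_MAX now means "scan from ptr to the end of the enclosing object" instead of triggering the too-large warning. A hedged usage sketch through the public kmemleak_scan_area() wrapper (the structure below is invented for illustration):

#include <linux/kmemleak.h>
#include <linux/slab.h>

struct example_obj {
	unsigned long header;
	void *ptrs[16];			/* the only part holding pointers */
};

static void example_register(struct example_obj *obj)
{
	/* scan everything from ->ptrs to the end of the object */
	kmemleak_scan_area(&obj->ptrs, SIZE_MAX, GFP_KERNEL);
}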
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bea2b262a4..175fff79dc9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
* Allocate stable and unstable together:
* MAXSMP NODES_SHIFT 10 will use 16kB.
*/
- buf = kcalloc(nr_node_ids + nr_node_ids,
- sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
+ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
+ GFP_KERNEL);
/* Let us assume that RB_ROOT is NULL is zero */
if (!buf)
err = -ENOMEM;
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7e..53e477bb555 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <asm-generic/sections.h>
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
+ .bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
return (i < type->cnt) ? i : -1;
}
+/*
+ * __memblock_find_range_bottom_up - find free area utility in bottom-up
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area bottom-up.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ cand = round_up(this_start, align);
+ if (cand < this_end && this_end - cand >= size)
+ return cand;
+ }
+
+ return 0;
+}
+
+/**
+ * __memblock_find_range_top_down - find free area utility, in top-down
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Utility called from memblock_find_in_range_node(), find free area top-down.
+ *
+ * RETURNS:
+ * Found address on success, 0 on failure.
+ */
+static phys_addr_t __init_memblock
+__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align, int nid)
+{
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
+
+ for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+
+ if (this_end < size)
+ continue;
+
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand;
+ }
+
+ return 0;
+}
+
/**
* memblock_find_in_range_node - find free area in given range and node
* @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*
* Find @size free area aligned to @align in the specified range and node.
*
+ * When the allocation direction is bottom-up, @start should be greater
+ * than the end of the kernel image. Otherwise, it will be trimmed. The
+ * reason is that we want bottom-up allocations to stay close to the kernel
+ * image, so that the allocated memory and the kernel are likely to reside
+ * on the same node.
+ *
+ * If bottom-up allocation failed, will try to allocate memory top-down.
+ *
* RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
{
- phys_addr_t this_start, this_end, cand;
- u64 i;
+ int ret;
+ phys_addr_t kernel_end;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
+ kernel_end = __pa_symbol(_end);
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
- this_start = clamp(this_start, start, end);
- this_end = clamp(this_end, start, end);
+ /*
+ * try bottom-up allocation only when bottom-up mode
+ * is set and @end is above the kernel image.
+ */
+ if (memblock_bottom_up() && end > kernel_end) {
+ phys_addr_t bottom_up_start;
- if (this_end < size)
- continue;
+ /* make sure we will allocate above the kernel */
+ bottom_up_start = max(start, kernel_end);
- cand = round_down(this_end - size, align);
- if (cand >= this_start)
- return cand;
+ /* ok, try bottom-up allocation first */
+ ret = __memblock_find_range_bottom_up(bottom_up_start, end,
+ size, align, nid);
+ if (ret)
+ return ret;
+
+ /*
+ * we always limit bottom-up allocation above the kernel,
+ * but top-down allocation doesn't have this limit, so
+ * retrying top-down allocation may succeed when bottom-up
+ * allocation failed.
+ *
+ * bottom-up allocation is expected to fail very rarely,
+ * so we use WARN_ONCE() here to see the stack trace when
+ * a failure happens.
+ */
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, "
+ "memory hotunplug may be affected\n");
}
- return 0;
+
+ return __memblock_find_range_top_down(start, end, size, align, nid);
}
/**
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
- * Found address on success, %0 on failure.
+ * Found address on success, 0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
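The rewritten memblock_find_in_range_node() only tries the bottom-up path when memblock_bottom_up() is true, so early boot code has to opt in. A hedged sketch of that opt-in, assuming the series also adds a memblock_set_bottom_up() setter and reusing the movable_node_enabled flag assumed in the earlier boot-option sketch:

void __init example_enable_bottom_up(void)
{
	/*
	 * Keep early allocations on the kernel's node when hotpluggable
	 * nodes should stay fully movable.
	 */
	if (movable_node_enabled)
		memblock_set_bottom_up(true);

	/*
	 * From here on, memblock allocations try bottom-up just above
	 * _end first, falling back to top-down if that fails.
	 */
}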
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13b9d0f221b..e3cd40b2d5d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,6 +59,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
+#include "slab.h"
#include <asm/uaccess.h>
@@ -312,7 +313,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct tcp_memcontrol tcp_mem;
+ struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
@@ -499,6 +500,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
+/*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ /*
+ * The ID of the root cgroup is 0, but memcg treats 0 as an
+ * invalid ID, so we return (cgroup_id + 1).
+ */
+ return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id - 1, &mem_cgroup_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
@@ -551,13 +575,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem.cg_proto;
+ return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ if (!memcg_proto_activated(&memcg->tcp_mem))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -570,16 +594,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- * allocation in ->create(). At that point, we haven't yet allocated the
- * css_id. Having a separate index prevents us from messing with the cgroup
- * core for this
+ * The main reason for not using the cgroup ID for this is that it works
+ * better in sparse environments, where we have a lot of memcgs but only
+ * a few of them kmem-limited. For instance, if we had 200 memcgs and
+ * none but the 200th were kmem-limited, we'd still need a 200-entry
+ * array for that.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -594,14 +613,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE 65535
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -1408,7 +1427,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return css_is_ancestor(&memcg->css, &root_memcg->css);
+ return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -2826,15 +2845,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
- struct cgroup_subsys_state *css;
-
/* ID 0 is unused ID */
if (!id)
return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return mem_cgroup_from_css(css);
+ return mem_cgroup_from_id(id);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2955,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
}
#ifdef CONFIG_SLABINFO
@@ -2984,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
- bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- /*
- * Conditions under which we can wait for the oom_killer. Those are
- * the same conditions tested by the core page allocator
- */
- may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, may_oom);
+ &_memcg, oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
@@ -3138,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(s));
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3399,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
- new_cachep = cachep->memcg_params->memcg_caches[idx];
+ new_cachep = cache_from_memcg_idx(cachep, idx);
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3445,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- c = s->memcg_params->memcg_caches[i];
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
if (!c)
continue;
@@ -3579,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (likely(cachep->memcg_params->memcg_caches[idx])) {
- cachep = cachep->memcg_params->memcg_caches[idx];
+ if (likely(cache_from_memcg_idx(cachep, idx))) {
+ cachep = cache_from_memcg_idx(cachep, idx);
goto out;
}
@@ -4350,7 +4357,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, css_id(&memcg->css));
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
}
#endif
@@ -4402,8 +4409,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = css_id(&from->css);
- new_id = css_id(&to->css);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -5376,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,