aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Kconfig.debug26
-rw-r--r--mm/Makefile1
-rw-r--r--mm/allocpercpu.c4
-rw-r--r--mm/backing-dev.c10
-rw-r--r--mm/debug-pagealloc.c129
-rw-r--r--mm/failslab.c1
-rw-r--r--mm/filemap.c23
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/highmem.c45
-rw-r--r--mm/hugetlb.c6
-rw-r--r--mm/internal.h8
-rw-r--r--mm/memcontrol.c687
-rw-r--r--mm/memory.c39
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/nommu.c52
-rw-r--r--mm/oom_kill.c13
-rw-r--r--mm/page-writeback.c42
-rw-r--r--mm/page_alloc.c42
-rw-r--r--mm/page_cgroup.c37
-rw-r--r--mm/pdflush.c49
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/readahead.c40
-rw-r--r--mm/shmem.c3
-rw-r--r--mm/slab.c78
-rw-r--r--mm/slob.c33
-rw-r--r--mm/slub.c78
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c27
-rw-r--r--mm/truncate.c10
-rw-r--r--mm/util.c46
-rw-r--r--mm/vmalloc.c19
-rw-r--r--mm/vmscan.c115
-rw-r--r--mm/vmstat.c18
35 files changed, 1181 insertions, 532 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf..b53427ad30a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
config UNEVICTABLE_LRU
bool "Add LRU list to track non-evictable pages"
default y
- depends on MMU
help
Keeps unevictable pages off of the active and inactive pageout
lists, so kswapd will not waste CPU time or have its balancing
@@ -214,5 +213,13 @@ config UNEVICTABLE_LRU
will use one page flag and increase the code size a little,
say Y unless you know what you are doing.
+config HAVE_MLOCK
+ bool
+ default y if MMU=y
+
+config HAVE_MLOCKED_PAGE_BIT
+ bool
+ default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
+
config MMU_NOTIFIER
bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 00000000000..bb01e298f26
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,26 @@
+config DEBUG_PAGEALLOC
+ bool "Debug page memory allocations"
+ depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ depends on !HIBERNATION || !PPC && !SPARC
+ ---help---
+ Unmap pages from the kernel linear mapping after free_pages().
+ This results in a large slowdown, but helps to find certain types
+ of memory corruptions.
+
+config WANT_PAGE_DEBUG_FLAGS
+ bool
+
+config PAGE_POISONING
+ bool "Debug page memory allocations"
+ depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ depends on !HIBERNATION
+ select DEBUG_PAGEALLOC
+ select WANT_PAGE_DEBUG_FLAGS
+ help
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). This results in a large slowdown,
+ but helps to find certain types of memory corruptions.
+
+ This option cannot enalbe with hibernation. Otherwise, it will get
+ wrong messages for memory corruption because the free pages are not
+ saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 818569b68f4..ec73c68b601 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 1882923bc70..dfdee6a4735 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -31,7 +31,7 @@ static void percpu_depopulate(void *__pdata, int cpu)
* @__pdata: per-cpu data to depopulate
* @mask: depopulate per-cpu data for cpu's selected through mask bits
*/
-static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
{
int cpu;
for_each_cpu_mask_nr(cpu, *mask)
@@ -143,7 +143,7 @@ void free_percpu(void *__pdata)
{
if (unlikely(!__pdata))
return;
- __percpu_depopulate_mask(__pdata, &cpu_possible_map);
+ __percpu_depopulate_mask(__pdata, cpu_possible_mask);
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index be68c956a66..493b468a503 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -284,12 +284,12 @@ static wait_queue_head_t congestion_wqh[2] = {
};
-void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum bdi_state bit;
- wait_queue_head_t *wqh = &congestion_wqh[rw];
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
- bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
clear_bit(bit, &bdi->state);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
@@ -297,11 +297,11 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
}
EXPORT_SYMBOL(clear_bdi_congested);
-void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum bdi_state bit;
- bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+ bit = sync ? BDI_sync_congested : BDI_async_congested;
set_bit(bit, &bdi->state);
}
EXPORT_SYMBOL(set_bdi_congested);
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 00000000000..a1e3324de2b
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/page-debug-flags.h>
+#include <linux/poison.h>
+
+static inline void set_page_poison(struct page *page)
+{
+ __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline void clear_page_poison(struct page *page)
+{
+ __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static inline bool page_poison(struct page *page)
+{
+ return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
+}
+
+static void poison_highpage(struct page *page)
+{
+ /*
+ * Page poisoning for highmem pages is not implemented.
+ *
+ * This can be called from interrupt contexts.
+ * So we need to create a new kmap_atomic slot for this
+ * application and it will need interrupt protection.
+ */
+}
+
+static void poison_page(struct page *page)
+{
+ void *addr;
+
+ if (PageHighMem(page)) {
+ poison_highpage(page);
+ return;
+ }
+ set_page_poison(page);
+ addr = page_address(page);
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(unsigned char *mem, size_t bytes)
+{
+ unsigned char *start;
+ unsigned char *end;
+
+ for (start = mem; start < mem + bytes; start++) {
+ if (*start != PAGE_POISON)
+ break;
+ }
+ if (start == mem + bytes)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!printk_ratelimit())
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ printk(KERN_ERR "pagealloc: single bit error\n");
+ else
+ printk(KERN_ERR "pagealloc: memory corruption\n");
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ dump_stack();
+}
+
+static void unpoison_highpage(struct page *page)
+{
+ /*
+ * See comment in poison_highpage().
+ * Highmem pages should not be poisoned for now
+ */
+ BUG_ON(page_poison(page));
+}
+
+static void unpoison_page(struct page *page)
+{
+ if (PageHighMem(page)) {
+ unpoison_highpage(page);
+ return;
+ }
+ if (page_poison(page)) {
+ void *addr = page_address(page);
+
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ }
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (!debug_pagealloc_enabled)
+ return;
+
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
diff --git a/mm/failslab.c b/mm/failslab.c
index 7c6ea6493f8..9339de5f0a9 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,4 +1,5 @@
#include <linux/fault-inject.h>
+#include <linux/gfp.h>
static struct {
struct fault_attr attr;
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d..2e2d38ebda4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -513,6 +513,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
}
return ret;
}
+EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
@@ -565,6 +566,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
EXPORT_SYMBOL(wait_on_page_bit);
/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page - Page defining the wait queue of interest
+ * @waiter - Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __add_wait_queue(q, waiter);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
+/**
* unlock_page - unlock a locked page
* @page: the page
*
@@ -627,6 +646,7 @@ int __lock_page_killable(struct page *page)
return __wait_on_bit_lock(page_waitqueue(page), &wait,
sync_page_killable, TASK_KILLABLE);
}
+EXPORT_SYMBOL_GPL(__lock_page_killable);
/**
* __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -2463,6 +2483,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
* (presumably at page->private). If the release was successful, return `1'.
* Otherwise return zero.
*
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
* The @gfp_mask argument specifies whether I/O may be performed to release
* this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
*
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0c04615651b..427dfe3ce78 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping,
}
}
nr = nr - offset;
- if (nr > len)
- nr = len;
+ if (nr > len - copied)
+ nr = len - copied;
error = mapping->a_ops->get_xip_mem(mapping, index, 0,
&xip_mem, &xip_pfn);
diff --git a/mm/highmem.c b/mm/highmem.c
index 910198037bf..68eb1d9b63f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,3 +422,48 @@ void __init page_address_init(void)
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+
+void debug_kmap_atomic(enum km_type type)
+{
+ static unsigned warn_count = 10;
+
+ if (unlikely(warn_count == 0))
+ return;
+
+ if (unlikely(in_interrupt())) {
+ if (in_irq()) {
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (!irqs_disabled()) { /* softirq */
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+ type != KM_SKB_SUNRPC_DATA &&
+ type != KM_SKB_DATA_SOFTIRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+ }
+
+ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+ if (!irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+ if (irq_count() == 0 && !irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+}
+
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a..28c655ba935 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* an instantiated the change should be committed via vma_commit_reservation.
* No action is required on failure.
*/
-static int vma_needs_reservation(struct hstate *h,
+static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
return 1;
} else {
- int err;
+ long err;
pgoff_t idx = vma_hugecache_offset(h, vma, addr);
struct resv_map *reservations = vma_resv_map(vma);
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct page *page;
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- unsigned int chg;
+ long chg;
/*
* Processes that did not create the mapping will have no reserves and
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2..987bb03fbdd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+#ifdef CONFIG_HAVE_MLOCK
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
}
+#endif
#ifdef CONFIG_UNEVICTABLE_LRU
/*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
}
#endif
-#ifdef CONFIG_UNEVICTABLE_LRU
+#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
/*
* Called only in fault path via page_evictable() for a new page
* to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
}
}
-#else /* CONFIG_UNEVICTABLE_LRU */
+#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void free_page_mlock(struct page *page) { }
-#endif /* CONFIG_UNEVICTABLE_LRU */
+#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
/*
* Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6..2fc6d6c4823 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
@@ -95,6 +96,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
return ret;
}
+static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
+{
+ s64 ret;
+
+ ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
+ ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
+ return ret;
+}
+
/*
* per-zone information in memory controller.
*/
@@ -154,9 +164,9 @@ struct mem_cgroup {
/*
* While reclaiming in a hiearchy, we cache the last child we
- * reclaimed from. Protected by hierarchy_mutex
+ * reclaimed from.
*/
- struct mem_cgroup *last_scanned_child;
+ int last_scanned_child;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -247,7 +257,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
return mem_cgroup_zoneinfo(mem, nid, zid);
}
-static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
int nid, zid;
@@ -286,6 +296,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *mem = NULL;
+
+ if (!mm)
+ return NULL;
/*
* Because we have no locks, mm->owner's may be being moved to other
* cgroup. We use css_tryget() here even if this looks
@@ -308,6 +321,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
return css_is_removed(&mem->css);
}
+
+/*
+ * Call callback function against all cgroup under hierarchy tree.
+ */
+static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
+ int (*func)(struct mem_cgroup *, void *))
+{
+ int found, ret, nextid;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *mem;
+
+ if (!root->use_hierarchy)
+ return (*func)(root, data);
+
+ nextid = 1;
+ do {
+ ret = 0;
+ mem = NULL;
+
+ rcu_read_lock();
+ css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
+ &found);
+ if (css && css_tryget(css))
+ mem = container_of(css, struct mem_cgroup, css);
+ rcu_read_unlock();
+
+ if (mem) {
+ ret = (*func)(mem, data);
+ css_put(&mem->css);
+ }
+ nextid = found + 1;
+ } while (!ret && css);
+
+ return ret;
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -441,31 +490,24 @@ void mem_cgroup_move_lists(struct page *page,
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
+ struct mem_cgroup *curr = NULL;
task_lock(task);
- ret = task->mm && mm_match_cgroup(task->mm, mem);
+ rcu_read_lock();
+ curr = try_get_mem_cgroup_from_mm(task->mm);
+ rcu_read_unlock();
task_unlock(task);
+ if (!curr)
+ return 0;
+ if (curr->use_hierarchy)
+ ret = css_is_ancestor(&curr->css, &mem->css);
+ else
+ ret = (curr == mem);
+ css_put(&curr->css);
return ret;
}
/*
- * Calculate mapped_ratio under memory controller. This will be used in
- * vmscan.c for deteremining we have to reclaim mapped pages.
- */
-int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
-{
- long total, rss;
-
- /*
- * usage is recorded in bytes. But, here, we assume the number of
- * physical pages can be represented by "long" on any arch.
- */
- total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
- rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
- return (int)((rss * 100L) / total);
-}
-
-/*
* prev_priority control...this will be used in memory reclaim path.
*/
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -501,8 +543,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
unsigned long gb;
unsigned long inactive_ratio;
- inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
+ inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -629,172 +671,202 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
-/*
- * This routine finds the DFS walk successor. This routine should be
- * called with hierarchy_mutex held
- */
-static struct mem_cgroup *
-__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
{
- struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
- curr_cgroup = curr->css.cgroup;
- root_cgroup = root_mem->css.cgroup;
+ if (do_swap_account) {
+ if (res_counter_check_under_limit(&mem->res) &&
+ res_counter_check_under_limit(&mem->memsw))
+ return true;
+ } else
+ if (res_counter_check_under_limit(&mem->res))
+ return true;
+ return false;
+}
- if (!list_empty(&curr_cgroup->children)) {
- /*
- * Walk down to children
- */
- cgroup = list_entry(curr_cgroup->children.next,
- struct cgroup, sibling);
- curr = mem_cgroup_from_cont(cgroup);
- goto done;
- }
+static unsigned int get_swappiness(struct mem_cgroup *memcg)
+{
+ struct cgroup *cgrp = memcg->css.cgroup;
+ unsigned int swappiness;
-visit_parent:
- if (curr_cgroup == root_cgroup) {
- /* caller handles NULL case */
- curr = NULL;
- goto done;
- }
+ /* root ? */
+ if (cgrp->parent == NULL)
+ return vm_swappiness;
- /*
- * Goto next sibling
- */
- if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
- cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
- sibling);
- curr = mem_cgroup_from_cont(cgroup);
- goto done;
- }
+ spin_lock(&memcg->reclaim_param_lock);
+ swappiness = memcg->swappiness;
+ spin_unlock(&memcg->reclaim_param_lock);
- /*
- * Go up to next parent and next parent's sibling if need be
- */
- curr_cgroup = curr_cgroup->parent;
- goto visit_parent;
+ return swappiness;
+}
-done:
- return curr;
+static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
+{
+ int *val = data;
+ (*val)++;
+ return 0;
}
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
+/**
+ * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
+ * @memcg: The memory cgroup that went over limit
+ * @p: Task that is going to be killed
+ *
+ * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
+ * enabled
*/
-static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *cgroup;
- struct mem_cgroup *orig, *next;
- bool obsolete;
-
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
/*
- * Scan all children under the mem_cgroup mem
+ * Need a buffer in BSS, can't rely on allocations. The code relies
+ * on the assumption that OOM is serialized for memory controller.
+ * If this assumption is broken, revisit this code.
*/
- mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+ static char memcg_name[PATH_MAX];
+ int ret;
+
+ if (!memcg)
+ return;
- orig = root_mem->last_scanned_child;
- obsolete = mem_cgroup_is_obsolete(orig);
- if (list_empty(&root_mem->css.cgroup->children)) {
+ rcu_read_lock();
+
+ mem_cgrp = memcg->css.cgroup;
+ task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
+
+ ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
+ if (ret < 0) {
/*
- * root_mem might have children before and last_scanned_child
- * may point to one of them. We put it later.
+ * Unfortunately, we are unable to convert to a useful name
+ * But we'll still print out the usage information
*/
- if (orig)
- VM_BUG_ON(!obsolete);
- next = NULL;
+ rcu_read_unlock();
goto done;
}
+ rcu_read_unlock();
- if (!orig || obsolete) {
- cgroup = list_first_entry(&root_mem->css.cgroup->children,
- struct cgroup, sibling);
- next = mem_cgroup_from_cont(cgroup);
- } else
- next = __mem_cgroup_get_next_node(orig, root_mem);
+ printk(KERN_INFO "Task in %s killed", memcg_name);
+ rcu_read_lock();
+ ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
+ if (ret < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ rcu_read_unlock();
+
+ /*
+ * Continues from above, so we don't need an KERN_ level
+ */
+ printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
done:
- if (next)
- mem_cgroup_get(next);
- root_mem->last_scanned_child = next;
- if (orig)
- mem_cgroup_put(orig);
- mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
- return (next) ? next : root_mem;
+
+ printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->res, RES_FAILCNT));
+ printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
+ "failcnt %llu\n",
+ res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
}
-static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
+/*
+ * This function returns the number of memcg under hierarchy tree. Returns
+ * 1(self count) if no children.
+ */
+static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
- if (do_swap_account) {
- if (res_counter_check_under_limit(&mem->res) &&
- res_counter_check_under_limit(&mem->memsw))
- return true;
- } else
- if (res_counter_check_under_limit(&mem->res))
- return true;
- return false;
+ int num = 0;
+ mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+ return num;
}
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+/*
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @mem and use
+ * that to reclaim free pages from.
+ */
+static struct mem_cgroup *
+mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
- struct cgroup *cgrp = memcg->css.cgroup;
- unsigned int swappiness;
+ struct mem_cgroup *ret = NULL;
+ struct cgroup_subsys_state *css;
+ int nextid, found;
- /* root ? */
- if (cgrp->parent == NULL)
- return vm_swappiness;
+ if (!root_mem->use_hierarchy) {
+ css_get(&root_mem->css);
+ ret = root_mem;
+ }
- spin_lock(&memcg->reclaim_param_lock);
- swappiness = memcg->swappiness;
- spin_unlock(&memcg->reclaim_param_lock);
+ while (!ret) {
+ rcu_read_lock();
+ nextid = root_mem->last_scanned_child + 1;
+ css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+ &found);
+ if (css && css_tryget(css))
+ ret = container_of(css, struct mem_cgroup, css);
+
+ rcu_read_unlock();
+ /* Updates scanning parameter */
+ spin_lock(&root_mem->reclaim_param_lock);
+ if (!css) {
+ /* this means start scan from ID:1 */
+ root_mem->last_scanned_child = 0;
+ } else
+ root_mem->last_scanned_child = found;
+ spin_unlock(&root_mem->reclaim_param_lock);
+ }
- return swappiness;
+ return ret;
}