Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   9
-rw-r--r--  mm/Kconfig.debug      |  26
-rw-r--r--  mm/Makefile           |   1
-rw-r--r--  mm/allocpercpu.c      |   4
-rw-r--r--  mm/backing-dev.c      |  10
-rw-r--r--  mm/debug-pagealloc.c  | 129
-rw-r--r--  mm/failslab.c         |   1
-rw-r--r--  mm/filemap.c          |  23
-rw-r--r--  mm/filemap_xip.c      |   4
-rw-r--r--  mm/highmem.c          |  45
-rw-r--r--  mm/hugetlb.c          |   6
-rw-r--r--  mm/internal.h         |   8
-rw-r--r--  mm/memcontrol.c       | 687
-rw-r--r--  mm/memory.c           |  39
-rw-r--r--  mm/migrate.c          |  10
-rw-r--r--  mm/mmap.c             |   3
-rw-r--r--  mm/nommu.c            |  52
-rw-r--r--  mm/oom_kill.c         |  13
-rw-r--r--  mm/page-writeback.c   |  42
-rw-r--r--  mm/page_alloc.c       |  42
-rw-r--r--  mm/page_cgroup.c      |  37
-rw-r--r--  mm/pdflush.c          |  49
-rw-r--r--  mm/quicklist.c        |   2
-rw-r--r--  mm/readahead.c        |  40
-rw-r--r--  mm/shmem.c            |   3
-rw-r--r--  mm/slab.c             |  78
-rw-r--r--  mm/slob.c             |  33
-rw-r--r--  mm/slub.c             |  78
-rw-r--r--  mm/sparse.c           |   4
-rw-r--r--  mm/swap.c             |  27
-rw-r--r--  mm/truncate.c         |  10
-rw-r--r--  mm/util.c             |  46
-rw-r--r--  mm/vmalloc.c          |  19
-rw-r--r--  mm/vmscan.c           | 115
-rw-r--r--  mm/vmstat.c           |  18
35 files changed, 1181 insertions, 532 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a5b77811fdf..b53427ad30a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -206,7 +206,6 @@ config VIRT_TO_BUS config UNEVICTABLE_LRU bool "Add LRU list to track non-evictable pages" default y - depends on MMU help Keeps unevictable pages off of the active and inactive pageout lists, so kswapd will not waste CPU time or have its balancing @@ -214,5 +213,13 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. +config HAVE_MLOCK + bool + default y if MMU=y + +config HAVE_MLOCKED_PAGE_BIT + bool + default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + config MMU_NOTIFIER bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 00000000000..bb01e298f26 --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,26 @@ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION || !PPC && !SPARC + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + +config WANT_PAGE_DEBUG_FLAGS + bool + +config PAGE_POISONING + bool "Debug page memory allocations" + depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC + depends on !HIBERNATION + select DEBUG_PAGEALLOC + select WANT_PAGE_DEBUG_FLAGS + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). This results in a large slowdown, + but helps to find certain types of memory corruptions. + + This option cannot enalbe with hibernation. Otherwise, it will get + wrong messages for memory corruption because the free pages are not + saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 818569b68f4..ec73c68b601 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_FAILSLAB) += failslab.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 1882923bc70..dfdee6a4735 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -31,7 +31,7 @@ static void percpu_depopulate(void *__pdata, int cpu) * @__pdata: per-cpu data to depopulate * @mask: depopulate per-cpu data for cpu's selected through mask bits */ -static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) { int cpu; for_each_cpu_mask_nr(cpu, *mask) @@ -143,7 +143,7 @@ void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; - __percpu_depopulate_mask(__pdata, &cpu_possible_map); + __percpu_depopulate_mask(__pdata, cpu_possible_mask); kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index be68c956a66..493b468a503 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -284,12 +284,12 @@ static wait_queue_head_t congestion_wqh[2] = { }; -void clear_bdi_congested(struct backing_dev_info *bdi, int rw) +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? 
BDI_sync_congested : BDI_async_congested; clear_bit(bit, &bdi->state); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) @@ -297,11 +297,11 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw) } EXPORT_SYMBOL(clear_bdi_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int rw) +void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; - bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; + bit = sync ? BDI_sync_congested : BDI_async_congested; set_bit(bit, &bdi->state); } EXPORT_SYMBOL(set_bdi_congested); diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c new file mode 100644 index 00000000000..a1e3324de2b --- /dev/null +++ b/mm/debug-pagealloc.c @@ -0,0 +1,129 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-debug-flags.h> +#include <linux/poison.h> + +static inline void set_page_poison(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline void clear_page_poison(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static inline bool page_poison(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); +} + +static void poison_highpage(struct page *page) +{ + /* + * Page poisoning for highmem pages is not implemented. + * + * This can be called from interrupt contexts. + * So we need to create a new kmap_atomic slot for this + * application and it will need interrupt protection. + */ +} + +static void poison_page(struct page *page) +{ + void *addr; + + if (PageHighMem(page)) { + poison_highpage(page); + return; + } + set_page_poison(page); + addr = page_address(page); + memset(addr, PAGE_POISON, PAGE_SIZE); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(unsigned char *mem, size_t bytes) +{ + unsigned char *start; + unsigned char *end; + + for (start = mem; start < mem + bytes; start++) { + if (*start != PAGE_POISON) + break; + } + if (start == mem + bytes) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!printk_ratelimit()) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_highpage(struct page *page) +{ + /* + * See comment in poison_highpage(). 
+ * Highmem pages should not be poisoned for now + */ + BUG_ON(page_poison(page)); +} + +static void unpoison_page(struct page *page) +{ + if (PageHighMem(page)) { + unpoison_highpage(page); + return; + } + if (page_poison(page)) { + void *addr = page_address(page); + + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + } +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} diff --git a/mm/failslab.c b/mm/failslab.c index 7c6ea6493f8..9339de5f0a9 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,4 +1,5 @@ #include <linux/fault-inject.h> +#include <linux/gfp.h> static struct { struct fault_attr attr; diff --git a/mm/filemap.c b/mm/filemap.c index 126d3973b3d..2e2d38ebda4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -513,6 +513,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, } return ret; } +EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) @@ -565,6 +566,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) EXPORT_SYMBOL(wait_on_page_bit); /** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page - Page defining the wait queue of interest + * @waiter - Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + +/** * unlock_page - unlock a locked page * @page: the page * @@ -627,6 +646,7 @@ int __lock_page_killable(struct page *page) return __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page_killable, TASK_KILLABLE); } +EXPORT_SYMBOL_GPL(__lock_page_killable); /** * __lock_page_nosync - get a lock on the page, without calling sync_page() @@ -2463,6 +2483,9 @@ EXPORT_SYMBOL(generic_file_aio_write); * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 
* diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b..427dfe3ce78 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; - if (nr > len) - nr = len; + if (nr > len - copied) + nr = len - copied; error = mapping->a_ops->get_xip_mem(mapping, index, 0, &xip_mem, &xip_pfn); diff --git a/mm/highmem.c b/mm/highmem.c index 910198037bf..68eb1d9b63f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -422,3 +422,48 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ + +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) + +void debug_kmap_atomic(enum km_type type) +{ + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +} + +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 107da3d809a..28c655ba935 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. 
*/ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; @@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and diff --git a/mm/internal.h b/mm/internal.h index 478223b73a2..987bb03fbdd 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +#ifdef CONFIG_HAVE_MLOCK extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } +#endif #ifdef CONFIG_UNEVICTABLE_LRU /* @@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. @@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page) } } -#else /* CONFIG_UNEVICTABLE_LRU */ +#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline void free_page_mlock(struct page *page) { } -#endif /* CONFIG_UNEVICTABLE_LRU */ +#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ /* * Return the mem_map entry representing the 'offset' subpage within diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6..2fc6d6c4823 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> +#include <linux/limits.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/swap.h> @@ -95,6 +96,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, return ret; } +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) +{ + s64 ret; + + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); + return ret; +} + /* * per-zone information in memory controller. */ @@ -154,9 +164,9 @@ struct mem_cgroup { /* * While reclaiming in a hiearchy, we cache the last child we - * reclaimed from. Protected by hierarchy_mutex + * reclaimed from. */ - struct mem_cgroup *last_scanned_child; + int last_scanned_child; /* * Should the accounting and control be hierarchical, per subtree? 
*/ @@ -247,7 +257,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) return mem_cgroup_zoneinfo(mem, nid, zid); } -static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { int nid, zid; @@ -286,6 +296,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -308,6 +321,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) return css_is_removed(&mem->css); } + +/* + * Call callback function against all cgroup under hierarchy tree. + */ +static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, + int (*func)(struct mem_cgroup *, void *)) +{ + int found, ret, nextid; + struct cgroup_subsys_state *css; + struct mem_cgroup *mem; + + if (!root->use_hierarchy) + return (*func)(root, data); + + nextid = 1; + do { + ret = 0; + mem = NULL; + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, + &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + rcu_read_unlock(); + + if (mem) { + ret = (*func)(mem, data); + css_put(&mem->css); + } + nextid = found + 1; + } while (!ret && css); + + return ret; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -441,31 +490,24 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } /* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. - */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - -/* * prev_priority control...this will be used in memory reclaim path. */ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -501,8 +543,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -629,172 +671,202 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -/* - * This routine finds the DFS walk successor. 
This routine should be - * called with hierarchy_mutex held - */ -static struct mem_cgroup * -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) +static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) { - struct cgroup *cgroup, *curr_cgroup, *root_cgroup; - - curr_cgroup = curr->css.cgroup; - root_cgroup = root_mem->css.cgroup; + if (do_swap_account) { + if (res_counter_check_under_limit(&mem->res) && + res_counter_check_under_limit(&mem->memsw)) + return true; + } else + if (res_counter_check_under_limit(&mem->res)) + return true; + return false; +} - if (!list_empty(&curr_cgroup->children)) { - /* - * Walk down to children - */ - cgroup = list_entry(curr_cgroup->children.next, - struct cgroup, sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } +static unsigned int get_swappiness(struct mem_cgroup *memcg) +{ + struct cgroup *cgrp = memcg->css.cgroup; + unsigned int swappiness; -visit_parent: - if (curr_cgroup == root_cgroup) { - /* caller handles NULL case */ - curr = NULL; - goto done; - } + /* root ? */ + if (cgrp->parent == NULL) + return vm_swappiness; - /* - * Goto next sibling - */ - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, - sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } + spin_lock(&memcg->reclaim_param_lock); + swappiness = memcg->swappiness; + spin_unlock(&memcg->reclaim_param_lock); - /* - * Go up to next parent and next parent's sibling if need be - */ - curr_cgroup = curr_cgroup->parent; - goto visit_parent; + return swappiness; +} -done: - return curr; +static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +{ + int *val = data; + (*val)++; + return 0; } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled */ -static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *root_mem) +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - struct cgroup *cgroup; - struct mem_cgroup *orig, *next; - bool obsolete; - + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; /* - * Scan all children under the mem_cgroup mem + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. */ - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; - orig = root_mem->last_scanned_child; - obsolete = mem_cgroup_is_obsolete(orig); - if (list_empty(&root_mem->css.cgroup->children)) { + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { /* - * root_mem might have children before and last_scanned_child - * may point to one of them. We put it later. 
+ * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information */ - if (orig) - VM_BUG_ON(!obsolete); - next = NULL; + rcu_read_unlock(); goto done; } + rcu_read_unlock(); - if (!orig || obsolete) { - cgroup = list_first_entry(&root_mem->css.cgroup->children, - struct cgroup, sibling); - next = mem_cgroup_from_cont(cgroup); - } else - next = __mem_cgroup_get_next_node(orig, root_mem); + printk(KERN_INFO "Task in %s killed", memcg_name); + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); done: - if (next) - mem_cgroup_get(next); - root_mem->last_scanned_child = next; - if (orig) - mem_cgroup_put(orig); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return (next) ? next : root_mem; + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); } -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) +/* + * This function returns the number of memcg under hierarchy tree. Returns + * 1(self count) if no children. + */ +static int mem_cgroup_count_children(struct mem_cgroup *mem) { - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; + int num = 0; + mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + return num; } -static unsigned int get_swappiness(struct mem_cgroup *memcg) +/* + * Visit the first child (need not be the first child as per the ordering + * of the cgroup list, since we track last_scanned_child) of @mem and use + * that to reclaim free pages from. + */ +static struct mem_cgroup * +mem_cgroup_select_victim(struct mem_cgroup *root_mem) { - struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; + struct mem_cgroup *ret = NULL; + struct cgroup_subsys_state *css; + int nextid, found; - /* root ? */ - if (cgrp->parent == NULL) - return vm_swappiness; + if (!root_mem->use_hierarchy) { + css_get(&root_mem->css); + ret = root_mem; + } - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); + while (!ret) { + rcu_read_lock(); + nextid = root_mem->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + &found); + if (css && css_tryget(css)) + ret = container_of(css, struct mem_cgroup, css); + + rcu_read_unlock(); + /* Updates scanning parameter */ + spin_lock(&root_mem->reclaim_param_lock); + if (!css) { + /* this means start scan from ID:1 */ + root_mem->last_scanned_child = 0; + } else + root_mem->last_scanned_child = found; + spin_unlock(&root_mem->reclaim_param_lock); + } - return swappiness; + return ret; } |
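The corruption check in the new mm/debug-pagealloc.c reports a "single bit error" when exactly one bit of the poison pattern has flipped and "memory corruption" otherwise. A minimal user-space sketch of that detection logic follows; the 0xaa poison byte and 4096-byte page size are stand-ins for the kernel's PAGE_POISON and PAGE_SIZE, not taken from this patch.

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stddef.h>

#define PAGE_POISON 0xaa    /* assumed poison byte for the sketch */
#define PAGE_SIZE   4096

/* True if a and b differ in exactly one bit. */
static bool single_bit_flip(unsigned char a, unsigned char b)
{
    unsigned char error = a ^ b;

    return error && !(error & (error - 1));
}

static void check_poison_mem(const unsigned char *mem, size_t bytes)
{
    const unsigned char *start, *end;

    /* Find the first and last bytes that no longer hold the pattern. */
    for (start = mem; start < mem + bytes; start++)
        if (*start != PAGE_POISON)
            break;
    if (start == mem + bytes) {
        puts("pattern intact");
        return;
    }
    for (end = mem + bytes - 1; end > start; end--)
        if (*end != PAGE_POISON)
            break;

    if (start == end && single_bit_flip(*start, PAGE_POISON))
        puts("pagealloc: single bit error");
    else
        puts("pagealloc: memory corruption");
}

int main(void)
{
    unsigned char page[PAGE_SIZE];

    memset(page, PAGE_POISON, sizeof(page));  /* "freed" page, fully poisoned */
    page[100] ^= 0x08;                        /* flip one bit */
    check_poison_mem(page, sizeof(page));     /* -> single bit error */

    page[200] = 0x00;                         /* scribble a whole byte */
    check_poison_mem(page, sizeof(page));     /* -> memory corruption */
    return 0;
}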
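The mm/filemap_xip.c hunk clamps each chunk of do_xip_mapping_read() to the bytes still wanted (len - copied) rather than to the whole request, so a read spanning several pages cannot copy more than was asked for. A simplified sketch of the difference; the 3-byte "page" and the loop shape are artificial, chosen only to make the overshoot visible, and are not the kernel's loop.

#include <stdio.h>
#include <string.h>

#define CHUNK 3    /* pretend pages are 3 bytes */

static size_t copy_loop(char *dst, const char *src, size_t srclen,
                        size_t len, int clamp_to_remaining)
{
    size_t copied = 0;

    while (copied < len && copied < srclen) {
        size_t nr = CHUNK;

        if (nr > srclen - copied)
            nr = srclen - copied;
        if (clamp_to_remaining) {
            if (nr > len - copied)      /* the fix: bound by what remains */
                nr = len - copied;
        } else {
            if (nr > len)               /* old check: can overshoot */
                nr = len;
        }
        memcpy(dst + copied, src + copied, nr);
        copied += nr;
    }
    return copied;
}

int main(void)
{
    const char src[] = "abcdefgh";
    char dst[16];

    memset(dst, 0, sizeof(dst));
    printf("old: copied %zu of 4 requested\n",
           copy_loop(dst, src, strlen(src), 4, 0));   /* copies 6 bytes */
    memset(dst, 0, sizeof(dst));
    printf("new: copied %zu of 4 requested\n",
           copy_loop(dst, src, strlen(src), 4, 1));   /* copies 4 bytes */
    return 0;
}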
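The mm/hugetlb.c hunks widen vma_needs_reservation() and the chg variable from (unsigned) int to long: the reservation code can hand back either a negative errno or a large page count, and narrowing that into an unsigned int makes the caller's error check unreachable. A stand-alone sketch of the truncation; region_chg_stub() is an invented stand-in for the kernel helper.

#include <stdio.h>
#include <errno.h>

static long region_chg_stub(void)
{
    return -ENOMEM;    /* pretend the reservation bookkeeping failed */
}

int main(void)
{
    unsigned int chg_broken = (unsigned int)region_chg_stub();
    long chg_fixed = region_chg_stub();

    if (chg_broken < 0)    /* always false for unsigned; compilers warn here */
        puts("old code: error detected");
    else
        printf("old code: error lost, chg looks like %u pages\n", chg_broken);

    if (chg_fixed < 0)
        printf("new code: error detected (%ld)\n", chg_fixed);
    return 0;
}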
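mem_cgroup_walk_tree() applies a callback to the root and to every memcg beneath it (or to the root alone when use_hierarchy is off), and mem_cgroup_count_children() uses it to count the groups in a subtree. A loose sketch of the pattern, assuming recursion over an invented child array in place of the css-ID iteration the kernel actually performs.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
    bool use_hierarchy;
    struct node *children[4];
};

static int walk_tree(struct node *root, void *data,
                     int (*func)(struct node *, void *))
{
    int ret;

    if (!root->use_hierarchy)
        return func(root, data);

    ret = func(root, data);
    for (size_t i = 0; i < 4 && !ret && root->children[i]; i++)
        ret = walk_tree(root->children[i], data, func);
    return ret;
}

static int count_cb(struct node *n, void *data)
{
    (void)n;
    (*(int *)data)++;
    return 0;    /* a non-zero return would abort the walk, as in the kernel */
}

int main(void)
{
    struct node c1 = { true, { NULL } };
    struct node c2 = { true, { NULL } };
    struct node root = { true, { &c1, &c2, NULL } };
    int num = 0;

    walk_tree(&root, &num, count_cb);
    printf("%d groups under hierarchy (self included)\n", num);
    return 0;
}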
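mem_cgroup_select_victim() replaces the DFS walk over cgroup sibling lists with a round-robin scan by css ID: the root memcg remembers last_scanned_child, resumes from the next ID, and wraps back to the start when the scan runs off the end. A rough user-space sketch of that cursor logic; the fixed array stands in for css_get_next(), the alive flag for css_tryget() succeeding, and the IDs are invented. It assumes at least one live group so the loop terminates.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct group {
    int id;
    bool alive;
};

static struct group groups[] = {
    { 1, true }, { 2, false }, { 3, true }, { 4, true },
};
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

static int last_scanned_child;    /* persists across calls, as in struct mem_cgroup */

/* Return the first live group with an ID of at least 'nextid', or NULL. */
static struct group *get_next(int nextid)
{
    for (size_t i = 0; i < NGROUPS; i++)
        if (groups[i].id >= nextid && groups[i].alive)
            return &groups[i];
    return NULL;
}

static struct group *select_victim(void)
{
    struct group *ret = NULL;

    while (!ret) {
        struct group *g = get_next(last_scanned_child + 1);

        if (!g) {
            last_scanned_child = 0;    /* wrap: next scan starts at ID 1 */
            continue;
        }
        last_scanned_child = g->id;    /* update the scanning cursor */
        ret = g;
    }
    return ret;
}

int main(void)
{
    for (int i = 0; i < 6; i++)
        printf("reclaim from group %d\n", select_victim()->id);
    return 0;    /* prints 1, 3, 4, 1, 3, 4: dead group 2 is skipped */
}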