aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c82
-rw-r--r--mm/failslab.c39
-rw-r--r--mm/filemap.c118
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/init-mm.c2
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/memcontrol.c471
-rw-r--r--mm/memory-failure.c92
-rw-r--r--mm/mempolicy.c25
-rw-r--r--mm/mincore.c11
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page-writeback.c269
-rw-r--r--mm/page_alloc.c60
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c1493
-rw-r--r--mm/slab.c99
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c772
-rw-r--r--mm/swapfile.c20
-rw-r--r--mm/truncate.c8
-rw-r--r--mm/vmalloc.c17
-rw-r--r--mm/vmscan.c74
-rw-r--r--mm/vmstat.c4
23 files changed, 1879 insertions, 1793 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8290b1e8825..d6edf8d14f9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+ if (wb1 < wb2) {
+ spin_lock(&wb1->list_lock);
+ spin_lock_nested(&wb2->list_lock, 1);
+ } else {
+ spin_lock(&wb2->list_lock);
+ spin_lock_nested(&wb1->list_lock, 1);
+ }
+}
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_wb_list_lock);
+ spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
- "BdiWriteback: %8lu kB\n"
- "BdiReclaimable: %8lu kB\n"
- "BdiDirtyThresh: %8lu kB\n"
- "DirtyThresh: %8lu kB\n"
- "BackgroundThresh: %8lu kB\n"
- "b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
- "bdi_list: %8u\n"
- "state: %8lx\n",
+ "BdiWriteback: %10lu kB\n"
+ "BdiReclaimable: %10lu kB\n"
+ "BdiDirtyThresh: %10lu kB\n"
+ "DirtyThresh: %10lu kB\n"
+ "BackgroundThresh: %10lu kB\n"
+ "BdiWritten: %10lu kB\n"
+ "BdiWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "bdi_list: %10u\n"
+ "state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ K(bdi_thresh),
+ K(dirty_thresh),
+ K(background_thresh),
+ (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+ (unsigned long) K(bdi->write_bandwidth),
+ nr_dirty,
+ nr_io,
+ nr_more_io,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
return wb_has_dirty_io(&bdi->wb);
}
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .range_cyclic = 1,
- .nr_to_write = 1024,
- };
-
- writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
/*
* kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
- * the bdi from the thread.
+ * the bdi from the thread. Hopefully 1024 is
+ * large enough for efficient IO.
*/
- bdi_flush_io(bdi);
+ writeback_inodes_wb(&bdi->wb, 1024);
} else {
/*
* The spinlock makes sure we do not lose
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ spin_lock_init(&wb->list_lock);
setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
}
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
}
bdi->dirty_exceeded = 0;
+
+ bdi->bw_time_stamp = jiffies;
+ bdi->written_stamp = 0;
+
+ bdi->write_bandwidth = INIT_BW;
+ bdi->avg_write_bandwidth = INIT_BW;
+
err = prop_local_init_percpu(&bdi->completions);
if (err) {
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_wb_list_lock);
+ bdi_lock_two(&bdi->wb, dst);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_wb_list_lock);
+ spin_unlock(&bdi->wb.list_lock);
+ spin_unlock(&dst->list_lock);
}
bdi_unregister(bdi);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240dd..0dd7b8fec71 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
struct fault_attr attr;
u32 ignore_gfp_wait;
int cache_filter;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
- struct dentry *ignore_gfp_wait_file;
- struct dentry *cache_filter_file;
-#endif
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init failslab_debugfs_init(void)
{
- mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
struct dentry *dir;
- int err;
-
- err = init_fault_attr_dentries(&failslab.attr, "failslab");
- if (err)
- return err;
- dir = failslab.attr.dentries.dir;
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
- failslab.ignore_gfp_wait_file =
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_wait);
+ dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
- failslab.cache_filter_file =
- debugfs_create_bool("cache-filter", mode, dir,
- &failslab.cache_filter);
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter))
+ goto fail;
- if (!failslab.ignore_gfp_wait_file ||
- !failslab.cache_filter_file) {
- err = -ENOMEM;
- debugfs_remove(failslab.cache_filter_file);
- debugfs_remove(failslab.ignore_gfp_wait_file);
- cleanup_fault_attr_dentries(&failslab.attr);
- }
+ return 0;
+fail:
+ debugfs_remove_recursive(dir);
- return err;
+ return -ENOMEM;
}
late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 10a17111327..7771871fa35 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include <linux/cleancache.h>
#include "internal.h"
@@ -78,7 +77,7 @@
* ->i_mutex (generic_file_buffered_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
- * inode_wb_list_lock
+ * bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
@@ -96,9 +95,9 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
int error;
VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
- if (PageSwapBacked(page))
- __inc_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
{
int ret;
- /*
- * Splice_read and readahead add shmem/tmpfs pages into the page cache
- * before shmem_readpage has a chance to mark them as SwapBacked: they
- * need to go on the anon lru below, and mem_cgroup_cache_charge
- * (called in add_to_page_cache) needs to know where they're going too.
- */
- if (mapping_cap_swap_backed(mapping))
- SetPageSwapBacked(page);
-
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
- if (ret == 0) {
- if (page_is_file_cache(page))
- lru_cache_add_file(page);
- else
- lru_cache_add_anon(page);
- }
+ if (ret == 0)
+ lru_cache_add_file(page);
return ret;
}
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@ repeat:
page = radix_tree_deref_slot(pagep);
if (unlikely(!page))
goto out;
- if (radix_tree_deref_retry(page))
- goto repeat;
-
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ goto repeat;
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ goto out;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
repeat:
page = find_get_page(mapping, offset);
- if (page) {
+ if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
{
unsigned int i;
unsigned int ret;
- unsigned int nr_found;
+ unsigned int nr_found, nr_skip;
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, start, nr_pages);
+ (void ***)pages, NULL, start, nr_pages);
ret = 0;
+ nr_skip = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
repeat:
@@ -849,13 +842,23 @@ repeat:
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page)) {
- WARN_ON(start | i);
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ WARN_ON(start | i);
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so skip over it -
+ * we only reach this from invalidate_mapping_pages().
+ */
+ nr_skip++;
+ continue;
}
if (!page_cache_get_speculative(page))
@@ -875,7 +878,7 @@ repeat:
* If all entries were removed before we could secure them,
* try again, because callers stop trying once 0 is returned.
*/
- if (unlikely(!ret && nr_found))
+ if (unlikely(!ret && nr_found > nr_skip))
goto restart;
rcu_read_unlock();
return ret;
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
restart:
nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, index, nr_pages);
+ (void ***)pages, NULL, index, nr_pages);
ret = 0;
for (i = 0; i < nr_found; i++) {
struct page *page;
@@ -912,12 +915,22 @@ repeat:
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so stop looking for
+ * contiguous pages.
+ */
+ break;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -977,12 +990,21 @@ repeat:
if (unlikely(!page))
continue;
- /*
- * This can only trigger when the entry at index 0 moves out
- * of or back to the root: none yet gotten, safe to restart.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+ /*
+ * This function is never used on a shmem/tmpfs
+ * mapping, so a swap entry won't be found here.
+ */
+ BUG();
+ }
if (!page_cache_get_speculative(page))
goto repeat;
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2e..5ef672c07f7 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot {
spinlock_t lock; /* Protect this bucket's list */
} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
{
return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
}
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
*
* Returns the page's virtual address.
*/
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
{
unsigned long flags;
void *ret;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b263..a56a851908d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/cpumask.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616f..d6880f542f9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
#include <asm/sections.h>
#include <asm/processor.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d2..3508777837c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
#include <linux/limits.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
-#include <linux/shmem_fs.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -246,10 +245,13 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- atomic_t oom_lock;
+
+ bool oom_lock;
+ atomic_t under_oom;
+
atomic_t refcnt;
- unsigned int swappiness;
+ int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
preempt_enable();
}
-static unsigned long
-mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+ unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
+ enum lru_list l;
+ unsigned long ret = 0;
+
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+ for_each_lru(l) {
+ if (BIT(l) & lru_mask)
+ ret += MEM_CGROUP_ZSTAT(mz, l);
+ }
+ return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+ int nid, unsigned int lru_mask)
+{
u64 total = 0;
int zid;
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
- total += MEM_CGROUP_ZSTAT(mz, idx);
- }
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+
return total;
}
-static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
- enum lru_list idx)
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+ unsigned int lru_mask)
{
int nid;
u64 total = 0;
- for_each_online_node(nid)
- total += mem_cgroup_get_zonestat_node(mem, nid, idx);
+ for_each_node_state(nid, N_HIGH_MEMORY)
+ total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
return total;
}
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page,
mem_cgroup_add_lru_list(page, to);
}
+/*
+ * Checks whether given mem is same or in the root_mem's
+ * hierarchy subtree
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
+ struct mem_cgroup *mem)
+{
+ if (root_mem != mem) {
+ return (root_mem->use_hierarchy &&
+ css_is_ancestor(&mem->css, &root_mem->css));
+ }
+
+ return true;
+}
+
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
* enabled in "curr" and "curr" is a child of "mem" in *cgroup*
* hierarchy(even if use_hierarchy is disabled in "mem").
*/
- if (mem->use_hierarchy)
- ret = css_is_ancestor(&curr->css, &mem->css);
- else
- ret = (curr == mem);
+ ret = mem_cgroup_same_or_subtree(mem, curr);
css_put(&curr->css);
return ret;
}
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
unsigned long gb;
unsigned long inactive_ratio;
- inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
+ inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+ active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
unsigned long active;
unsigned long inactive;
- inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
- active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+ inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+ active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
return (active > inactive);
}
-unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
- struct zone *zone,
- enum lru_list lru)
-{
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
- return MEM_CGROUP_ZSTAT(mz, lru);
-}
-
-static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
- int nid)
-{
- unsigned long ret;
-
- ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
- mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
-
- return ret;
-}
-
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
- int nid)
-{
- unsigned long ret;
-
- ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
- mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
- return ret;
-}
-
-#if MAX_NUMNODES > 1
-static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
-{
- u64 total = 0;
- int nid;
-
- for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
-
- return total;
-}
-
-static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
-{
- u64 total = 0;
- int nid;
-
- for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
-
- return total;
-}
-
-static unsigned long
-mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
-{
- return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
-}
-
-static unsigned long
-mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
-{
- u64 total = 0;
- int nid;
-
- for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
-
- return total;
-}
-
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid)
-{
- enum lru_list l;
- u64 total = 0;
-
- for_each_lru(l)
- total += mem_cgroup_get_zonestat_node(memcg, nid, l);
-
- return total;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
-{
- u64 total = 0;
- int nid;
-
- for_each_node_state(nid, N_HIGH_MEMORY)
- total += mem_cgroup_node_nr_lru_pages(memcg, nid);
-
- return total;
-}
-#endif /* CONFIG_NUMA */
-
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
return margin >> PAGE_SHIFT;
}
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
to = mc.to;
if (!from)
goto unlock;
- if (from == mem || to == mem
- || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
- || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
- ret = true;
+
+ ret = mem_cgroup_same_or_subtree(mem, from)
+ || mem_cgroup_same_or_subtree(mem, to);
unlock:
spin_unlock(&mc.lock);
return ret;
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+ if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+ if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
return true;
return false;
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
- if (!check_soft && root_mem->memsw_is_minimum)
+ if (!check_soft && !shrink && root_mem->memsw_is_minimum)
noswap = true;
while (1) {
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/* we use swappiness of local cgroup */
if (check_soft) {
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone,
- &nr_scanned);
+ noswap, zone, &nr_scanned);
*total_scanned += nr_scanned;
} else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
- noswap, get_swappiness(victim));
+ noswap);
css_put(&victim->css);
/*
* At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
+ * Has to be called with memcg_oom_lock
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- int x, lock_count = 0;
- struct mem_cgroup *iter;
+ struct mem_cgroup *iter, *failed = NULL;
+ bool cond = true;
- for_each_mem_cgroup_tree(iter, mem) {
- x = atomic_inc_return(&iter->oom_lock);
- lock_count = max(x, lock_count);
+ for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ if (iter->oom_lock) {
+ /*
+ * this subtree of our hierarchy is already locked
+ * so we cannot give a lock.
+ */
+ failed = iter;
+ cond = false;
+ } else
+ iter->oom_lock = true;
}
- if (lock_count == 1)
+ if (!failed)
return true;
+
+ /*
+ * OK, we failed to lock the whole subtree so we have to clean up
+ * what we set up to the failing subtree
+ */
+ cond = true;
+ for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+ if (iter == failed) {
+ cond = false;
+ continue;
+ }
+ iter->oom_lock = false;
+ }
return false;
}
+/*
+ * Has to be called with memcg_oom_lock
+ */
static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
struct mem_cgroup *iter;
+ for_each_mem_cgroup_tree(iter, mem)
+ iter->oom_lock = false;
+ return 0;
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ atomic_inc(&iter->under_oom);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *iter;
+
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
for_each_mem_cgroup_tree(iter, mem)
- atomic_add_unless(&iter->oom_lock, -1, 0);
- return 0;
+ atomic_add_unless(&iter->under_oom, -1, 0);
}
-
-static DEFINE_MUTEX(memcg_oom_mutex);
+static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
@@ -1845,25 +1816,20 @@ struct oom_wait_info {
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+ struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
+ *oom_wait_mem;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+ oom_wait_mem = oom_wait_info->mem;
- if (oom_wait_info->mem == wake_mem)
- goto wakeup;
- /* if no hierarchy, no match */
- if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
- return 0;
/*
* Both of oom_wait_info->mem and wake_mem are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
- if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
- !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+ if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
+ && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
return 0;
-
-wakeup:
return autoremove_wake_function(wait, mode, sync, arg);
}
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (mem && atomic_read(&mem->oom_lock))
+ if (mem && atomic_read(&mem->under_oom))
memcg_wakeup_oom(mem);
}
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
owait.wait.private = current;
INIT_LIST_HEAD(&owait.wait.task_list);
need_to_kill = true;
+ mem_cgroup_mark_under_oom(mem);
+
/* At first, try to OOM lock hierarchy under mem.*/
- mutex_lock(&memcg_oom_mutex);
+ spin_lock(&memcg_oom_lock);
locked = mem_cgroup_oom_lock(mem);
/*
* Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
need_to_kill = false;
if (locked)
mem_cgroup_oom_notify(mem);
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
}
- mutex_lock(&memcg_oom_mutex);
- mem_cgroup_oom_unlock(mem);
+ spin_lock(&memcg_oom_lock);
+ if (locked)
+ mem_cgroup_oom_unlock(mem);
memcg_wakeup_oom(mem);
- mutex_unlock(&memcg_oom_mutex);
+ spin_unlock(&memcg_oom_lock);
+
+ mem_cgroup_unmark_under_oom(mem);
if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
return false;
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
}
/*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
*/
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_a