aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-01-11 03:43:52 +0100
committerIngo Molnar <mingo@elte.hu>2009-01-11 03:43:52 +0100
commit99cd7074891f87c49660e3b2880564324a4733ac (patch)
tree903d2665bcb445f1f265d1adf7a99f265bcefc15 /mm
parente8a9cbf6ae620d9e5ba9cb42001c033287a284a3 (diff)
parentc59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into tracing/urgent
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c8
-rw-r--r--mm/bootmem.c8
-rw-r--r--mm/filemap.c32
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/hugetlb.c46
-rw-r--r--mm/internal.h2
-rw-r--r--mm/memcontrol.c1847
-rw-r--r--mm/memory.c204
-rw-r--r--mm/memory_hotplug.c20
-rw-r--r--mm/migrate.c131
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/mmap.c32
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/nommu.c1027
-rw-r--r--mm/oom_kill.c119
-rw-r--r--mm/page-writeback.c245
-rw-r--r--mm/page_alloc.c143
-rw-r--r--mm/page_cgroup.c209
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/rmap.c60
-rw-r--r--mm/shmem.c102
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap.c77
-rw-r--r--mm/swap_state.c35
-rw-r--r--mm/swapfile.c600
-rw-r--r--mm/tiny-shmem.c134
-rw-r--r--mm/vmalloc.c50
-rw-r--r--mm/vmscan.c324
31 files changed, 3801 insertions, 1691 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a81..a5b77811fdf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
example on NUMA systems to put pages nearer to the processors accessing
the page.
-config RESOURCES_64BIT
- bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
- default 64BIT
- help
- This option allows memory and IO resources to be 64 bit.
-
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7..72255be57f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
- readahead.o swap.o truncate.o vmscan.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e..8e858744413 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- long background_thresh;
- long dirty_thresh;
- long bdi_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_prop_frac = PROP_FRAC_BASE;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
goto err;
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142..51a0ccf61e0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long fallback = 0;
unsigned long min, max, start, sidx, midx, step;
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
BUG_ON(!size);
BUG_ON(align & (align - 1));
BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
if (!bdata->node_bootmem_map)
return NULL;
- bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
- bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
- align, goal, limit);
-
min = bdata->node_min_pfn;
max = bdata->node_low_pfn;
diff --git a/mm/filemap.c b/mm/filemap.c
index f5769b4dc07..ceba0bd0366 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
+ .nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
VM_BUG_ON(!PageLocked(page));
error = mem_cgroup_cache_charge(page, current->mm,
- gfp_mask & ~__GFP_HIGHMEM);
+ gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
@@ -741,7 +741,14 @@ repeat:
page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ /*
+ * We want a regular kernel memory (not highmem or DMA etc)
+ * allocation for the radix tree nodes, but we need to honour
+ * the context-specific requirements the caller has asked for.
+ * GFP_RECLAIM_MASK collects those requirements.
+ */
+ err = add_to_page_cache_lru(page, mapping, index,
+ (gfp_mask & GFP_RECLAIM_MASK));
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
return NULL;
}
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
page_cache_release(page);
page = NULL;
}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
- retval = filemap_write_and_wait(mapping);
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
@@ -1530,7 +1538,6 @@ retry_find:
/*
* Found the page and have a reference on it.
*/
- mark_page_accessed(page);
ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (count != ocount)
*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
- /*
- * Unmap all mmappings of the file up-front.
- *
- * This will cause any pte dirty bits to be propagated into the
- * pageframes for the subsequent filemap_write_and_wait().
- */
write_len = iov_length(iov, *nr_segs);
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, pos, write_len, 0);
- written = filemap_write_and_wait(mapping);
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
goto out;
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
- status = filemap_write_and_wait(mapping);
+ status = filemap_write_and_wait_range(mapping,
+ pos, pos + written - 1);
return written ? written : status;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2..0c04615651b 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7..62d5bbda921 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb8..618e9830408 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
}
/*
+ * Return the size of the pages allocated when backing a VMA. In the majority of
+ * cases this will be the same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate;
+
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ hstate = hstate_vma(vma);
+
+ return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
* bits of the reservation map pointer, which are always clear due to
* alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES))
- return clear_gigantic_page(page, addr, sz);
+ if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, sz);
+ return;
+ }
might_sleep();
for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
int i;
struct hstate *h = hstate_vma(vma);
- if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
- return copy_gigantic_page(dst, src, addr, vma);
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src, addr, vma);
+ return;
+ }
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
return page;
}
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
* puts them into the mem_map).
*/
m = addr;
- if (m)
- goto found;
+ goto found;
}
hstate_next_node(h);
nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb6..478223b73a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
+extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0..e2996b80601 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
+#include "internal.h"
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* remembers the boot option */
+#else
+#define do_swap_account (0)
+#endif
+
+static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */
+
/*
* Statistics for memory cgroup.
*/
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu cpustat[0];
};
/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
/*
* spin_lock to protect the per cgroup LRU
*/
- spinlock_t lru_lock;
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
+
+ struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,73 @@ struct mem_cgroup {
*/
struct res_counter res;
/*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
+ /*
+ * Protects reclaim-related members.
+ */
+ spinlock_t reclaim_param_lock;
+
int prev_priority; /* for recording reclaim priority */
+
+ /*
+ * While reclaiming in a hierarchy, we cache the last child we
+ * reclaimed from. Protected by hierarchy_mutex
+ */
+ struct mem_cgroup *last_scanned_child;
/*
- * statistics.
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+ unsigned long last_oom_jiffies;
+ atomic_t refcnt;
+
+ unsigned int swappiness;
+
+ /*
+ * statistics. This must be placed at the end of memcg.
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
+ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
#define PCGF_LOCK (1UL << PCG_LOCK)
-#define PCGF_FILE (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
- PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
- PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
- PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
+ PCGF_USED | PCGF_LOCK, /* Anon */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
0, /* FORCE */
};
-/*
- * Always modified under lru lock. Then, not necessary to preempt_disable()
- */
+/* for encoding cft->private value on file */
+#define _MEM (0)
+#define _MEMSWAP (1)
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -167,10 +210,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
- VM_BUG_ON(!irqs_disabled());
-
- cpustat = &stat->cpustat[smp_processor_id()];
+ cpustat = &stat->cpustat[cpu];
if (PageCgroupCache(pc))
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
else
@@ -182,6 +224,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
else
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+ put_cpu();
}
static struct mem_cgroup_per_zone *
@@ -197,6 +240,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
+ if (!mem)
+ return NULL;
+
return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -236,77 +282,152 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css);
}
-static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- int lru = LRU_BASE;
+ struct mem_cgroup *mem = NULL;
+ /*
+ * Because we hold no locks, mm->owner may be in the middle of being moved
+ * to another cgroup. We use css_tryget() here even if this looks
+ * pessimistic (rather than adding locks here).
+ */
+ rcu_read_lock();
+ do {
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ break;
+ } while (!css_tryget(&mem->css));
+ rcu_read_unlock();
+ return mem;
+}
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
+{
+ if (!mem)
+ return true;
+ return css_is_removed(&mem->css);
+}
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+/*
+ * Following LRU functions are allowed to be used without PCG_LOCK.
+ * Operations are called by routine of global LRU independently from memcg.
+ * What we have to take care of here is the validity of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happen when
+ * 1. charge
+ * 2. moving account
+ * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
+ * It is added to LRU before charge.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
- list_del(&pc->lru);
+void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+ struct mem_cgroup_per_zone *mz;
+
+ if (mem_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ /* can happen while we handle swapcache. */
+ if (list_empty(&pc->lru) || !pc->mem_cgroup)
+ return;
+ /*
+ * We don't check PCG_USED bit. It's cleared when the "page" is finally
+ * removed from global LRU.
+ */
+ mz = page_cgroup_zoneinfo(pc);
+ mem = pc->mem_cgroup;
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ list_del_init(&pc->lru);
+ return;
}
-static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+void mem_cgroup_del_lru(struct page *page)
{
- int lru = LRU_BASE;
+ mem_cgroup_del_lru_list(page, page_lru(page));
+}
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct page_cgroup *pc;
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
- list_add(&pc->lru, &mz->lists[lru]);
+ if (mem_cgroup_disabled())
+ return;
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+ pc = lookup_page_cgroup(page);
+ smp_rmb();
+ /* unused page is not rotated. */
+ if (!PageCgroupUsed(pc))
+ return;
+ mz = page_cgroup_zoneinfo(pc);
+ list_move(&pc->lru, &mz->lists[lru]);
}
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
- int active = PageCgroupActive(pc);
- int file = PageCgroupFile(pc);
- int unevictable = PageCgroupUnevictable(pc);
- enum lru_list from = unevictable ? LRU_UNEVICTABLE :
- (LRU_FILE * !!file + !!active);
+ struct page_cgroup *pc;
+ struct mem_cgroup_per_zone *mz;
- if (lru == from)
+ if (mem_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ /* barrier to sync with "charge" */
+ smp_rmb();
+ if (!PageCgroupUsed(pc))
return;
- MEM_CGROUP_ZSTAT(mz, from) -= 1;
+ mz = page_cgroup_zoneinfo(pc);
+ MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ list_add(&pc->lru, &mz->lists[lru]);
+}
+
+/*
+ * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
+ * the LRU because the page may be reused after it's fully uncharged (because of
+ * SwapCache behavior). To handle that, unlink the page_cgroup from the LRU when
+ * charging it again. This function is only used to charge SwapCache. It's done
+ * under lock_page and it is expected that zone->lru_lock is never held.
+ */
+static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
/*
- * However this is done under mz->lru_lock, another flags, which
- * are not related to LRU, will be modified from out-of-lock.
- * We have to use atomic set/clear flags.
+ * Forget old LRU when this page_cgroup is *not* used. This Used bit
+ * is guarded by lock_page() because the page is SwapCache.
*/
- if (is_unevictable_lru(lru)) {
- ClearPageCgroupActive(pc);
- SetPageCgroupUnevictable(pc);
- } else {
- if (is_active_lru(lru))
- SetPageCgroupActive(pc);
- else
- ClearPageCgroupActive(pc);
- ClearPageCgroupUnevictable(pc);
- }
+ if (!PageCgroupUsed(pc))
+ mem_cgroup_del_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
- list_move(&pc->lru, &mz->lists[lru]);
+static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ /* link when the page is linked to LRU but page_cgroup isn't */
+ if (PageLRU(page) && list_empty(&pc->lru))
+ mem_cgroup_add_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
+
+void mem_cgroup_move_lists(struct page *page,
+ enum lru_list from, enum lru_list to)
+{
+ if (mem_cgroup_disabled())
+ return;
+ mem_cgroup_del_lru_list(page, from);
+ mem_cgroup_add_lru_list(page, to);
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -320,37 +441,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
}
/*
- * This routine assumes that the appropriate zone's lru lock is already held
- */
-void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
- unsigned long flags;
-
- if (mem_cgroup_subsys.disabled)
- return;
-
- /*
- * We cannot lock_page_cgroup while holding zone's lru_lock,
- * because other holders of lock_page_cgroup can be interrupted
- * with an attempt to rotate_reclaimable_page. But we cannot
- * safely get to page_cgroup without it, so just try_lock it:
- * mem_cgroup_isolate_pages allows for page left on wrong list.
- */
- pc = lookup_page_cgroup(page);
- if (!trylock_page_cgroup(pc))
- return;
- if (pc && PageCgroupUsed(pc)) {
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_move_lists(pc, lru);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
- }
- unlock_page_cgroup(pc);
-}
-
-/*
* Calculate mapped_ratio under memory controller. This will be used in
* vmscan.c for deteremining we have to reclaim mapped pages.
*/
@@ -372,39 +462,108 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
*/
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
- return mem->prev_priority;
+ int prev_priority;
+
+ spin_lock(&mem->reclaim_param_lock);
+ prev_priority = mem->prev_priority;
+ spin_unlock(&mem->reclaim_param_lock);
+
+ return prev_priority;
}
void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock);
if (priority < mem->prev_priority)
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock);
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
-/*
- * Calculate # of pages to be scanned in this priority/zone.
- * See also vmscan.c
- *
- * priority starts from "DEF_PRIORITY" and decremented in each loop.
- * (see include/linux/mmzone.h)
- */
+static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long gb;
+ unsigned long inactive_ratio;
+
+ inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ if (present_pages) {
+ present_pages[0] = inactive;
+ present_pages[1] = active;
+ }
+
+ return inactive_ratio;
+}
+
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long present_pages[2];
+ unsigned long inactive_ratio;
-long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
- int priority, enum lru_list lru)
+ inactive_ratio = calc_inactive_ratio(memcg, present_pages);
+
+ inactive = present_pages[0];
+ active = present_pages[1];
+
+ if (inactive * inactive_ratio < active)
+ return 1;
+
+ return 0;
+}
+
+unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
+ struct zone *zone,
+ enum lru_list lru)
{
- long nr_pages;
int nid = zone->zone_pgdat->node_id;
int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
+ return MEM_CGROUP_ZSTAT(mz, lru);
+}
- return (nr_pages >> priority);
+struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
+ struct zone *zone)
+{
+ int nid = zone->zone_pgdat->node_id;
+ int zid = zone_idx(zone);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+
+ return &mz->reclaim_stat;
+}
+
+struct zone_reclaim_stat *
+mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup_per_zone *mz;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ pc = lookup_page