author    James Morris <jmorris@namei.org>  2009-02-06 11:01:45 +1100
committer James Morris <jmorris@namei.org>  2009-02-06 11:01:45 +1100
commit    cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree      7c06d8f30783115e3384721046258ce615b129c5 /mm
parent    8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent    f01d1d546abb2f4028b5299092f529eefb01253a (diff)
Merge branch 'master' into next
Conflicts:
	fs/namei.c

Manually merged per:

diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
+		if (!err)
+			err = ima_path_check(&nd->path, MAY_EXEC);
 		if (err)
 			break;

@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
 		flag &= ~O_TRUNC;
 	}

-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
+
-	error = ima_path_check(&nd->path,
++	error = ima_path_check(path,
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+	if (error)
+		return error;

 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */

Signed-off-by: James Morris <jmorris@namei.org>
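The resolved ordering, distilled: the ordinary permission check runs first, and only on success does the IMA check run, with the access mode masked down to read/write/exec. A standalone sketch of that shape (stubbed checks; the MAY_* values match the kernel's, everything else here is illustrative):

    #include <stdio.h>

    #define MAY_EXEC  0x1
    #define MAY_WRITE 0x2
    #define MAY_READ  0x4

    /* Stubs standing in for inode_permission() and ima_path_check(). */
    static int inode_permission_stub(int acc_mode) { (void)acc_mode; return 0; }
    static int ima_path_check_stub(int mask)       { (void)mask;     return 0; }

    /* The shape of the resolved may_open() hunk: DAC check first, then IMA. */
    static int may_open_shape(int acc_mode)
    {
        int error = inode_permission_stub(acc_mode);
        if (error)
            return error;
        return ima_path_check_stub(acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
    }

    int main(void)
    {
        printf("%d\n", may_open_shape(MAY_READ | MAY_WRITE)); /* 0 == allowed */
        return 0;
    }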
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    6
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/backing-dev.c     |    8
-rw-r--r--  mm/bootmem.c         |    8
-rw-r--r--  mm/fadvise.c         |   18
-rw-r--r--  mm/filemap.c         |   56
-rw-r--r--  mm/filemap_xip.c     |    2
-rw-r--r--  mm/fremap.c          |    6
-rw-r--r--  mm/hugetlb.c         |   46
-rw-r--r--  mm/internal.h        |    2
-rw-r--r--  mm/madvise.c         |    2
-rw-r--r--  mm/memcontrol.c      | 1898
-rw-r--r--  mm/memory.c          |  234
-rw-r--r--  mm/memory_hotplug.c  |   20
-rw-r--r--  mm/mempolicy.c       |   24
-rw-r--r--  mm/migrate.c         |  139
-rw-r--r--  mm/mincore.c         |    4
-rw-r--r--  mm/mlock.c           |   64
-rw-r--r--  mm/mmap.c            |  117
-rw-r--r--  mm/mprotect.c        |   12
-rw-r--r--  mm/mremap.c          |    8
-rw-r--r--  mm/msync.c           |    4
-rw-r--r--  mm/nommu.c           | 1054
-rw-r--r--  mm/oom_kill.c        |  119
-rw-r--r--  mm/page-writeback.c  |  254
-rw-r--r--  mm/page_alloc.c      |  143
-rw-r--r--  mm/page_cgroup.c     |  209
-rw-r--r--  mm/page_io.c         |    6
-rw-r--r--  mm/pdflush.c         |   16
-rw-r--r--  mm/rmap.c            |   60
-rw-r--r--  mm/shmem.c           |  104
-rw-r--r--  mm/slab.c            |    2
-rw-r--r--  mm/slub.c            |   24
-rw-r--r--  mm/swap.c            |   77
-rw-r--r--  mm/swap_state.c      |   35
-rw-r--r--  mm/swapfile.c        |  607
-rw-r--r--  mm/tiny-shmem.c      |  134
-rw-r--r--  mm/vmalloc.c         |   57
-rw-r--r--  mm/vmscan.c          |  328
-rw-r--r--  mm/vmstat.c          |    4
40 files changed, 4061 insertions, 1854 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a81..a5b77811fdf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
example on NUMA systems to put pages nearer to the processors accessing
the page.
-config RESOURCES_64BIT
- bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
- default 64BIT
- help
- This option allows memory and IO resources to be 64 bit.
-
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7..72255be57f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o pdflush.o \
- readahead.o swap.o truncate.o vmscan.o \
+ readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e..8e858744413 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- long background_thresh;
- long dirty_thresh;
- long bdi_thresh;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long bdi_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_prop_frac = PROP_FRAC_BASE;
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
goto err;
}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142..51a0ccf61e0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long fallback = 0;
unsigned long min, max, start, sidx, midx, step;
+ bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+ bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+ align, goal, limit);
+
BUG_ON(!size);
BUG_ON(align & (align - 1));
BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
if (!bdata->node_bootmem_map)
return NULL;
- bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
- bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
- align, goal, limit);
-
min = bdata->node_min_pfn;
max = bdata->node_low_pfn;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a1da969bd98..54a0f8040af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -24,7 +24,7 @@
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
*/
-asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
@@ -126,12 +126,26 @@ out:
fput(file);
return ret;
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+ return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif
#ifdef __ARCH_WANT_SYS_FADVISE64
-asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
{
return sys_fadvise64_64(fd, offset, len, advice);
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+ return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif
#endif
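Aside on the wrapper pattern introduced above: CONFIG_HAVE_SYSCALL_WRAPPERS exists for 64-bit ABIs where the upper half of a register carrying a 32-bit syscall argument is not guaranteed to be clean, so each SyS_* stub takes its arguments as long and casts down before calling the SYSC_* body. A minimal userspace sketch of the effect (illustrative values only, not kernel code):

    #include <stdio.h>

    /* Mimics a SyS_* stub: accept the full register width as long, then
     * truncate to the declared 32-bit type, as SYSC_fadvise64_64 above
     * is called with "(int) fd". */
    static long stub(long fd)
    {
        int clean_fd = (int)fd; /* well-defined truncation to 32 bits */
        return clean_fd;
    }

    int main(void)
    {
        long reg = 0x1badc0de00000003L; /* pretend garbage in the upper bits */
        printf("fd seen by the handler: %ld\n", stub(reg)); /* prints 3 */
        return 0;
    }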
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f8944d1..23acefe5180 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
int ret;
struct writeback_control wbc = {
.sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
+ .nr_to_write = LONG_MAX,
.range_start = start,
.range_end = end,
};
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
VM_BUG_ON(!PageLocked(page));
error = mem_cgroup_cache_charge(page, current->mm,
- gfp_mask & ~__GFP_HIGHMEM);
+ gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
@@ -741,7 +741,14 @@ repeat:
page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ /*
+ * We want a regular kernel memory (not highmem or DMA etc)
+ * allocation for the radix tree nodes, but we need to honour
+ * the context-specific requirements the caller has asked for.
+ * GFP_RECLAIM_MASK collects those requirements.
+ */
+ err = add_to_page_cache_lru(page, mapping, index,
+ (gfp_mask & GFP_RECLAIM_MASK));
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
return NULL;
}
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
- if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+ if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
page_cache_release(page);
page = NULL;
}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
- retval = filemap_write_and_wait(mapping);
+ retval = filemap_write_and_wait_range(mapping, pos,
+ pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
@@ -1366,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
return 0;
}
-asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
ssize_t ret;
struct file *file;
@@ -1385,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
}
return ret;
}
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+ return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
#ifdef CONFIG_MMU
/**
@@ -1530,7 +1545,6 @@ retry_find:
/*
* Found the page and have a reference on it.
*/
- mark_page_accessed(page);
ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
vmf->page = page;
return ret | VM_FAULT_LOCKED;
@@ -1766,7 +1780,7 @@ int should_remove_suid(struct dentry *dentry)
if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
kill |= ATTR_KILL_SGID;
- if (unlikely(kill && !capable(CAP_FSETID)))
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
return kill;
return 0;
@@ -2060,18 +2074,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (count != ocount)
*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
- /*
- * Unmap all mmappings of the file up-front.
- *
- * This will cause any pte dirty bits to be propagated into the
- * pageframes for the subsequent filemap_write_and_wait().
- */
write_len = iov_length(iov, *nr_segs);
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
- if (mapping_mapped(mapping))
- unmap_mapping_range(mapping, pos, write_len, 0);
- written = filemap_write_and_wait(mapping);
+ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
goto out;
@@ -2140,19 +2146,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
* Find or create a page at the given pagecache position. Return the locked
* page. This function is specifically for buffered writes.
*/
-struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
{
int status;
struct page *page;
+ gfp_t gfp_notmask = 0;
+ if (flags & AOP_FLAG_NOFS)
+ gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
if (likely(page))
return page;
- page = page_cache_alloc(mapping);
+ page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
if (!page)
return NULL;
- status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+ status = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & ~gfp_notmask);
if (unlikely(status)) {
page_cache_release(page);
if (status == -EEXIST)
@@ -2161,7 +2172,7 @@ repeat:
}
return page;
}
-EXPORT_SYMBOL(__grab_cache_page);
+EXPORT_SYMBOL(grab_cache_page_write_begin);
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
@@ -2286,7 +2297,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
- status = filemap_write_and_wait(mapping);
+ status = filemap_write_and_wait_range(mapping,
+ pos, pos + written - 1);
return written ? written : status;
}
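Aside on the gfp masking used in the filemap.c hunks above: placement modifiers from the caller (such as __GFP_HIGHMEM) must not leak into the page-cache bookkeeping allocations, while context restrictions (such as a cleared __GFP_FS under AOP_FLAG_NOFS) must be honoured. A minimal sketch of that filtering, using made-up bit values rather than the real linux/gfp.h definitions:

    #include <stdio.h>

    typedef unsigned int gfp_t;

    /* Illustrative bit values only; the real flags live in linux/gfp.h. */
    #define __GFP_HIGHMEM 0x02u /* placement: where the page may come from */
    #define __GFP_IO      0x40u /* context: reclaim may start I/O */
    #define __GFP_FS      0x80u /* context: reclaim may call into the fs */

    /* Stand-in for the kernel's GFP_RECLAIM_MASK: keep only the bits that
     * describe what this allocation context permits. */
    #define GFP_RECLAIM_MASK (__GFP_IO | __GFP_FS)

    int main(void)
    {
        gfp_t caller = __GFP_HIGHMEM | __GFP_FS;
        /* The highmem placement bit is dropped; the fs-reentry bit survives. */
        printf("0x%02x\n", caller & GFP_RECLAIM_MASK); /* prints 0x80 */
        return 0;
    }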
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2..0c04615651b 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7..736ba7f3306 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page, vma);
+ page_remove_rmap(page);
page_cache_release(page);
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss);
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
* and the vma's default protection is used. Arbitrary protections
* might be implemented in the future.
*/
-asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
- unsigned long prot, unsigned long pgoff, unsigned long flags)
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb8..618e9830408 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
}
/*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * of cases this will be the same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+ struct hstate *hstate;
+
+ if (!is_vm_hugetlb_page(vma))
+ return PAGE_SIZE;
+
+ hstate = hstate_vma(vma);
+
+ return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+ return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
* Flags for MAP_PRIVATE reservations. These are stored in the bottom
* bits of the reservation map pointer, which are always clear due to
* alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
{
int i;
- if (unlikely(sz > MAX_ORDER_NR_PAGES))
- return clear_gigantic_page(page, addr, sz);
+ if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, sz);
+ return;
+ }
might_sleep();
for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
int i;
struct hstate *h = hstate_vma(vma);
- if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
- return copy_gigantic_page(dst, src, addr, vma);
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src, addr, vma);
+ return;
+ }
might_sleep();
for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
return page;
}
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
* puts them into the mem_map).
*/
m = addr;
- if (m)
- goto found;
+ goto found;
}
hstate_next_node(h);
nr_nodes--;
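Aside on vma_kernel_pagesize() added above: the backing page size falls directly out of the hstate order as 1UL << (order + PAGE_SHIFT). A standalone illustration with typical x86 values (assumed here, not taken from this commit):

    #include <stdio.h>

    #define PAGE_SHIFT 12 /* assumed 4 KiB base pages */

    int main(void)
    {
        unsigned int order = 9; /* 2^9 base pages per huge page */
        unsigned long sz = 1UL << (order + PAGE_SHIFT);
        printf("%lu KiB\n", sz >> 10); /* prints 2048 KiB, i.e. 2 MiB */
        return 0;
    }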
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb6..478223b73a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
/*
* in mm/page_alloc.c
*/
+extern unsigned long highest_memmap_pfn;
extern void __free_pages_bootmem(struct page *page, unsigned int order);
/*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2
#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int flags,
diff --git a/mm/madvise.c b/mm/madvise.c
index f9349c18a1b..b9ce574827c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
unsigned long end, tmp;
struct vm_area_struct * vma, *prev;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0..8e4be9cb2a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
+#include "internal.h"
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account is set */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* to remember the boot option */
+#else
+#define do_swap_account (0)
+#endif
+
+static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */
+
/*
* Statistics for memory cgroup.
*/
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu cpustat[0];
};
/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
/*
* spin_lock to protect the per cgroup LRU
*/
- spinlock_t lru_lock;
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
+
+ struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,74 @@ struct mem_cgroup {
*/
struct res_counter res;
/*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
+ /*
+ * protects reclaim-related members.
+ */
+ spinlock_t reclaim_param_lock;
+
int prev_priority; /* for recording reclaim priority */
+
+ /*
+ * While reclaiming in a hierarchy, we cache the last child we
+ * reclaimed from. Protected by hierarchy_mutex
+ */
+ struct mem_cgroup *last_scanned_child;
/*
- * statistics.
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+ unsigned long last_oom_jiffies;
+ atomic_t refcnt;
+
+ unsigned int swappiness;
+
+ /*
+ * statistics. This must be placed at the end of memcg.
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
+ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
#define PCGF_LOCK (1UL << PCG_LOCK)
-#define PCGF_FILE (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
- PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
- PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
- PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
+ PCGF_USED | PCGF_LOCK, /* Anon */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
0, /* FORCE */
};
-/*
- * Always modified under lru lock. Then, not necessary to preempt_disable()
- */
+/* for encoding cft->private value on file */
+#define _MEM (0)
+#define _MEMSWAP (1)
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -167,10 +211,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
- VM_BUG_ON(!irqs_disabled());
-
- cpustat = &stat->cpustat[smp_processor_id()];
+ cpustat = &stat->cpustat[cpu];
if (PageCgroupCache(pc))
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
else
@@ -182,6 +225,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
else
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+ put_cpu();
}
static struct mem_cgroup_per_zone *
@@ -197,6 +241,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
+ if (!mem)
+ return NULL;
+
return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -236,118 +283,169 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css);
}
-static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- int lru = LRU_BASE;
-
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
-
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
-
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
- list_del(&pc->lru);
+ struct mem_cgroup *mem = NULL;
+ /*
+ * Because we have no locks, mm->owner may be being moved to another
+ * cgroup. We use css_tryget() here even if this looks
+ * pessimistic (rather than adding locks here).
+ */
+ rcu_read_lock();
+ do {
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ break;
+ } while (!css_tryget(&mem->css));
+ rcu_read_unlock();
+ return mem;
}
-static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
{
- int lru = LRU_BASE;
+ if (!mem)
+ return true;
+ return css_is_removed(&mem->css);
+}
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
+/*
+ * The following LRU functions may be used without PCG_LOCK. The
+ * operations are called by the global LRU routines, independently of memcg.
+ * What we have to take care of here is the validity of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happens when
+ * 1. charge
+ * 2. moving account
+ * In the typical case, "charge" is done before add-to-LRU. The exception
+ * is SwapCache, which is added to the LRU before being charged.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
- list_add(&pc->lru, &mz->lists[lru]);
+void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+ struct mem_cgroup_per_zone *mz;
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+ if (mem_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ /* can happen while we handle swapcache. */
+ if (list_empty(&pc->lru) || !pc->mem_cgroup)
+ return;
+ /*
+ * We don't check PCG_USED bit. It's cleared when the "page" is finally
+ * removed from global LRU.
+ */
+ mz = page_cgroup_zoneinfo(pc);
+ mem = pc->mem_cgroup;
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ list_del_init(&pc->lru);
+ return;
}
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+void mem_cgroup_del_lru(struct page *page)
{
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
- int active = PageCgroupActive(pc);
- int file = PageCgroupFile(pc);
- int unevictable = PageCgroupUnevictable(pc);
- enum lru_list from = unevictable ? LRU_UNEVICTABLE :
- (LRU_FILE * !!file + !!active);
+ mem_cgroup_del_lru_list(page, page_lru(page));
+}
- if (lru == from)
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct page_cgroup *pc;
+
+ if (mem_cgroup_disabled())
return;
- MEM_CGROUP_ZSTAT(mz, from) -= 1;
+ pc = lookup_page_cgroup(page);