author    | James Morris <jmorris@namei.org> | 2009-02-06 11:01:45 +1100
committer | James Morris <jmorris@namei.org> | 2009-02-06 11:01:45 +1100
commit    | cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree      | 7c06d8f30783115e3384721046258ce615b129c5 /mm
parent    | 8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent    | f01d1d546abb2f4028b5299092f529eefb01253a (diff)
Merge branch 'master' into next
Conflicts:
fs/namei.c
Manually merged per:
diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
  		nd->flags |= LOOKUP_CONTINUE;
  		err = exec_permission_lite(inode);
  		if (err == -EAGAIN)
- 			err = vfs_permission(nd, MAY_EXEC);
 +			err = inode_permission(nd->path.dentry->d_inode,
 +					       MAY_EXEC);
 +		if (!err)
 +			err = ima_path_check(&nd->path, MAY_EXEC);
  		if (err)
  			break;
@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
  		flag &= ~O_TRUNC;
  	}
  
- 	error = vfs_permission(nd, acc_mode);
 +	error = inode_permission(inode, acc_mode);
  	if (error)
  		return error;
 +
- 	error = ima_path_check(&nd->path,
++	error = ima_path_check(path,
 +			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
 +	if (error)
 +		return error;
  	/*
  	 * An append-only file must be opened in append mode for writing.
  	 */
Signed-off-by: James Morris <jmorris@namei.org>
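
The conflict resolution above reads more clearly as straight-line logic: 'master' replaced vfs_permission() with inode_permission(), while 'next' added an IMA check after the permission check, so the merged code calls both and fails the open on the first error. Below is a minimal userspace sketch of that control flow only; may_open_sketch() and the fake_* helpers are hypothetical stand-ins for the kernel functions, not kernel APIs.

    #include <stdio.h>

    /* Hypothetical stand-ins for the kernel's inode_permission() and
     * ima_path_check(): 0 means "allowed", a negative errno-style value
     * means failure, mirroring the kernel convention. */
    static int fake_inode_permission(int dac_ok) { return dac_ok ? 0 : -13; /* -EACCES */ }
    static int fake_ima_path_check(int ima_ok)   { return ima_ok ? 0 : -13; /* -EACCES */ }

    /* The merged may_open() ordering in miniature: the DAC permission
     * check runs first; the IMA check runs only if the DAC check
     * succeeded, and either failure aborts the open. */
    static int may_open_sketch(int dac_ok, int ima_ok)
    {
            int error;

            error = fake_inode_permission(dac_ok);
            if (error)
                    return error;

            error = fake_ima_path_check(ima_ok);
            if (error)
                    return error;

            return 0;
    }

    int main(void)
    {
            printf("both pass: %d\n", may_open_sketch(1, 1)); /* 0   */
            printf("DAC fails: %d\n", may_open_sketch(0, 1)); /* -13 */
            printf("IMA fails: %d\n", may_open_sketch(1, 0)); /* -13 */
            return 0;
    }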
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig          |    6
-rw-r--r-- | mm/Makefile         |    4
-rw-r--r-- | mm/backing-dev.c    |    8
-rw-r--r-- | mm/bootmem.c        |    8
-rw-r--r-- | mm/fadvise.c        |   18
-rw-r--r-- | mm/filemap.c        |   56
-rw-r--r-- | mm/filemap_xip.c    |    2
-rw-r--r-- | mm/fremap.c         |    6
-rw-r--r-- | mm/hugetlb.c        |   46
-rw-r--r-- | mm/internal.h       |    2
-rw-r--r-- | mm/madvise.c        |    2
-rw-r--r-- | mm/memcontrol.c     | 1898
-rw-r--r-- | mm/memory.c         |  234
-rw-r--r-- | mm/memory_hotplug.c |   20
-rw-r--r-- | mm/mempolicy.c      |   24
-rw-r--r-- | mm/migrate.c        |  139
-rw-r--r-- | mm/mincore.c        |    4
-rw-r--r-- | mm/mlock.c          |   64
-rw-r--r-- | mm/mmap.c           |  117
-rw-r--r-- | mm/mprotect.c       |   12
-rw-r--r-- | mm/mremap.c         |    8
-rw-r--r-- | mm/msync.c          |    4
-rw-r--r-- | mm/nommu.c          | 1054
-rw-r--r-- | mm/oom_kill.c       |  119
-rw-r--r-- | mm/page-writeback.c |  254
-rw-r--r-- | mm/page_alloc.c     |  143
-rw-r--r-- | mm/page_cgroup.c    |  209
-rw-r--r-- | mm/page_io.c        |    6
-rw-r--r-- | mm/pdflush.c        |   16
-rw-r--r-- | mm/rmap.c           |   60
-rw-r--r-- | mm/shmem.c          |  104
-rw-r--r-- | mm/slab.c           |    2
-rw-r--r-- | mm/slub.c           |   24
-rw-r--r-- | mm/swap.c           |   77
-rw-r--r-- | mm/swap_state.c     |   35
-rw-r--r-- | mm/swapfile.c       |  607
-rw-r--r-- | mm/tiny-shmem.c     |  134
-rw-r--r-- | mm/vmalloc.c        |   57
-rw-r--r-- | mm/vmscan.c         |  328
-rw-r--r-- | mm/vmstat.c         |    4
40 files changed, 4061 insertions, 1854 deletions
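
For reference, cgit renders a merge commit as a diff against its first parent, so given a local kernel clone the table above should be reproducible with `git diff --stat cb5629b10d64a8006622ce3a52bc887d91057d69^1 cb5629b10d64a8006622ce3a52bc887d91057d69 -- mm` (exact counts may differ slightly depending on the git version's rename-detection defaults).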
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a81..a5b77811fdf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7..72255be57f8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
 
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e..8e858744413 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
 			goto err;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142..51a0ccf61e0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
 
+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;
 
-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a1da969bd98..54a0f8040af 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -24,7 +24,7 @@
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
 	struct file *file = fget(fd);
 	struct address_space *mapping;
@@ -126,12 +126,26 @@ out:
 	fput(file);
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+	return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif
 
 #ifdef __ARCH_WANT_SYS_FADVISE64
 
-asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
 {
 	return sys_fadvise64_64(fd, offset, len, advice);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+	return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif
 
 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f8944d1..23acefe5180 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = LONG_MAX,
 		.range_start = start,
 		.range_end = end,
 	};
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	VM_BUG_ON(!PageLocked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
-					gfp_mask & ~__GFP_HIGHMEM);
+					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
 		goto out;
 
@@ -741,7 +741,14 @@ repeat:
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return NULL;
-		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		/*
+		 * We want a regular kernel memory (not highmem or DMA etc)
+		 * allocation for the radix tree nodes, but we need to honour
+		 * the context-specific requirements the caller has asked for.
+		 * GFP_RECLAIM_MASK collects those requirements.
+		 */
+		err = add_to_page_cache_lru(page, mapping, index,
+					(gfp_mask & GFP_RECLAIM_MASK));
 		if (unlikely(err)) {
 			page_cache_release(page);
 			page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 		return NULL;
 	}
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 		page_cache_release(page);
 		page = NULL;
 	}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = filemap_write_and_wait(mapping);
+			retval = filemap_write_and_wait_range(mapping, pos,
+					pos + iov_length(iov, nr_segs) - 1);
 			if (!retval) {
 				retval = mapping->a_ops->direct_IO(READ, iocb,
 							iov, pos, nr_segs);
@@ -1366,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
 	return 0;
 }
 
-asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
+SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
 {
 	ssize_t ret;
 	struct file *file;
@@ -1385,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
 	}
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
+{
+	return SYSC_readahead((int) fd, offset, (size_t) count);
+}
+SYSCALL_ALIAS(sys_readahead, SyS_readahead);
+#endif
 
 #ifdef CONFIG_MMU
 
 /**
@@ -1530,7 +1545,6 @@ retry_find:
 	/*
 	 * Found the page and have a reference on it.
 	 */
-	mark_page_accessed(page);
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
@@ -1766,7 +1780,7 @@ int should_remove_suid(struct dentry *dentry)
 	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
 		kill |= ATTR_KILL_SGID;
 
-	if (unlikely(kill && !capable(CAP_FSETID)))
+	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
 		return kill;
 
 	return 0;
@@ -2060,18 +2074,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	/*
-	 * Unmap all mmappings of the file up-front.
-	 *
-	 * This will cause any pte dirty bits to be propagated into the
-	 * pageframes for the subsequent filemap_write_and_wait().
-	 */
 	write_len = iov_length(iov, *nr_segs);
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
-	if (mapping_mapped(mapping))
-		unmap_mapping_range(mapping, pos, write_len, 0);
 
-	written = filemap_write_and_wait(mapping);
+	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
 	if (written)
 		goto out;
 
@@ -2140,19 +2146,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+					pgoff_t index, unsigned flags)
 {
 	int status;
 	struct page *page;
+	gfp_t gfp_notmask = 0;
+
+	if (flags & AOP_FLAG_NOFS)
+		gfp_notmask = __GFP_FS;
 repeat:
 	page = find_lock_page(mapping, index);
 	if (likely(page))
 		return page;
 
-	page = page_cache_alloc(mapping);
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
 	if (!page)
 		return NULL;
-	status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	status = add_to_page_cache_lru(page, mapping, index,
+						GFP_KERNEL & ~gfp_notmask);
 	if (unlikely(status)) {
 		page_cache_release(page);
 		if (status == -EEXIST)
@@ -2161,7 +2172,7 @@ repeat:
 	}
 	return page;
 }
-EXPORT_SYMBOL(__grab_cache_page);
+EXPORT_SYMBOL(grab_cache_page_write_begin);
 
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
@@ -2286,7 +2297,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	 * the file data here, to try to honour O_DIRECT expectations.
 	 */
 	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait(mapping);
+		status = filemap_write_and_wait_range(mapping,
+					pos, pos + written - 1);
 
 	return written ? written : status;
 }
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2..0c04615651b 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush_notify(vma, address, pte);
-		page_remove_rmap(page, vma);
+		page_remove_rmap(page);
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7..736ba7f3306 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
 			dec_mm_counter(mm, file_rss);
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
  * and the vma's default protection is used. Arbitrary protections
  * might be implemented in the future.
  */
-asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long prot, unsigned long pgoff, unsigned long flags)
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct address_space *mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb8..618e9830408 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }
 
 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;
 
-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 		 * puts them into the mem_map).
 		 */
 		m = addr;
-		if (m)
-			goto found;
+		goto found;
 	}
 	hstate_next_node(h);
 	nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb6..478223b73a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE                  0x1
 #define GUP_FLAGS_FORCE                  0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL         0x8
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/madvise.c b/mm/madvise.c
index f9349c18a1b..b9ce574827c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct * vma, *prev;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0..8e4be9cb2a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -34,12 +36,23 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include "internal.h"
 
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+#else
+#define do_swap_account		(0)
+#endif
+
+static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+
 /*
  * Statistics for memory cgroup.
  */
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
 } ____cacheline_aligned_in_smp;
 
 struct mem_cgroup_stat {
-	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
 /*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
-	spinlock_t		lru_lock;
 	struct list_head	lists[NR_LRU_LISTS];
 	unsigned long		count[NR_LRU_LISTS];
+
+	struct zone_reclaim_stat reclaim_stat;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -122,44 +136,74 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 
+	/*
+	  protect against reclaim related member.
+	*/
+	spinlock_t reclaim_param_lock;
+
+	int	prev_priority;	/* for recording reclaim priority */
+
+	/*
+	 * While reclaiming in a hiearchy, we cache the last child we
+	 * reclaimed from. Protected by hierarchy_mutex
+	 */
+	struct mem_cgroup *last_scanned_child;
 	/*
-	 * statistics.
+	 * Should the accounting and control be hierarchical, per subtree?
+	 */
+	bool use_hierarchy;
+	unsigned long	last_oom_jiffies;
+	atomic_t	refcnt;
+
+	unsigned int	swappiness;
+
+	/*
+	 * statistics. This must be placed at the end of memcg.
 	 */
 	struct mem_cgroup_stat stat;
 };
-static struct mem_cgroup init_mem_cgroup;
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 	NR_CHARGE_TYPE,
 };
 
 /* only for here (for easy reading.) */
 #define PCGF_CACHE	(1UL << PCG_CACHE)
 #define PCGF_USED	(1UL << PCG_USED)
-#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
 #define PCGF_LOCK	(1UL << PCG_LOCK)
-#define PCGF_FILE	(1UL << PCG_FILE)
 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
-	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
-	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
+	PCGF_USED | PCGF_LOCK, /* Anon */
+	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
 	0, /* FORCE */
 };
 
-/*
- * Always modified under lru lock. Then, not necessary to preempt_disable()
- */
+/* for encoding cft->private value on file */
+#define _MEM			(0)
+#define _MEMSWAP		(1)
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -167,10 +211,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
 	struct mem_cgroup_stat_cpu *cpustat;
+	int cpu = get_cpu();
 
-	VM_BUG_ON(!irqs_disabled());
-
-	cpustat = &stat->cpustat[smp_processor_id()];
+	cpustat = &stat->cpustat[cpu];
 	if (PageCgroupCache(pc))
 		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 	else
@@ -182,6 +225,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	else
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+	put_cpu();
 }
 
 static struct mem_cgroup_per_zone *
@@ -197,6 +241,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
 	int nid = page_cgroup_nid(pc);
 	int zid = page_cgroup_zid(pc);
 
+	if (!mem)
+		return NULL;
+
 	return mem_cgroup_zoneinfo(mem, nid, zid);
 }
 
@@ -236,118 +283,169 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
-			struct page_cgroup *pc)
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
-	int lru = LRU_BASE;
-
-	if (PageCgroupUnevictable(pc))
-		lru = LRU_UNEVICTABLE;
-	else {
-		if (PageCgroupActive(pc))
-			lru += LRU_ACTIVE;
-		if (PageCgroupFile(pc))
-			lru += LRU_FILE;
-	}
-
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
-
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
-	list_del(&pc->lru);
+	struct mem_cgroup *mem = NULL;
+	/*
+	 * Because we have no locks, mm->owner's may be being moved to other
+	 * cgroup. We use css_tryget() here even if this looks
+	 * pessimistic (rather than adding locks here).
+	 */
+	rcu_read_lock();
+	do {
+		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem))
+			break;
+	} while (!css_tryget(&mem->css));
+	rcu_read_unlock();
+	return mem;
 }
 
-static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
-				struct page_cgroup *pc)
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
 {
-	int lru = LRU_BASE;
+	if (!mem)
+		return true;
+	return css_is_removed(&mem->css);
+}
 
-	if (PageCgroupUnevictable(pc))
-		lru = LRU_UNEVICTABLE;
-	else {
-		if (PageCgroupActive(pc))
-			lru += LRU_ACTIVE;
-		if (PageCgroupFile(pc))
-			lru += LRU_FILE;
-	}
+/*
+ * Following LRU functions are allowed to be used without PCG_LOCK.
+ * Operations are called by routine of global LRU independently from memcg.
+ * What we have to take care of here is validness of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happens when
+ * 1. charge
+ * 2. moving account
+ * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
+ * It is added to LRU before charge.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
 
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
-	list_add(&pc->lru, &mz->lists[lru]);
+void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
+{
+	struct page_cgroup *pc;
+	struct mem_cgroup *mem;
+	struct mem_cgroup_per_zone *mz;
 
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+	if (mem_cgroup_disabled())
+		return;
+	pc = lookup_page_cgroup(page);
+	/* can happen while we handle swapcache. */
+	if (list_empty(&pc->lru) || !pc->mem_cgroup)
+		return;
+	/*
+	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
+	 * removed from global LRU.
+	 */
+	mz = page_cgroup_zoneinfo(pc);
+	mem = pc->mem_cgroup;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	list_del_init(&pc->lru);
+	return;
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+void mem_cgroup_del_lru(struct page *page)
 {
-	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
-	int active  = PageCgroupActive(pc);
-	int file    = PageCgroupFile(pc);
-	int unevictable = PageCgroupUnevictable(pc);
-	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
-				(LRU_FILE * !!file + !!active);
+	mem_cgroup_del_lru_list(page, page_lru(page));
+}
 
-	if (lru == from)
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_disabled())
 		return;
 
-	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+	pc = lookup_page_cgroup(page);