diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-02 16:07:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-02 16:07:27 -0700 |
commit | 092f4c56c1927e4b61a41ee8055005f1cb437009 (patch) | |
tree | 616ceb54b7671ccc13922ae9e002b8b972f6e09e /mm | |
parent | 80c2861672bbf000f6af838656959ee937e4ee4d (diff) | |
parent | c1e2ee2dc436574880758b3836fc96935b774c32 (diff) |
Merge branch 'akpm' (Andrew's incoming - part two)
Says Andrew:
"60 patches. That's good enough for -rc1 I guess. I have quite a lot
of detritus to be rechecked, work through maintainers, etc.
- most of the remains of MM
- rtc
- various misc
- cgroups
- memcg
- cpusets
- procfs
- ipc
- rapidio
- sysctl
- pps
- w1
- drivers/misc
- aio"
* akpm: (60 commits)
memcg: replace ss->id_lock with a rwlock
aio: allocate kiocbs in batches
drivers/misc/vmw_balloon.c: fix typo in code comment
drivers/misc/vmw_balloon.c: determine page allocation flag can_sleep outside loop
w1: disable irqs in critical section
drivers/w1/w1_int.c: multiple masters used same init_name
drivers/power/ds2780_battery.c: fix deadlock upon insertion and removal
drivers/power/ds2780_battery.c: add a nolock function to w1 interface
drivers/power/ds2780_battery.c: create central point for calling w1 interface
w1: ds2760 and ds2780, use ida for id and ida_simple_get() to get it
pps gpio client: add missing dependency
pps: new client driver using GPIO
pps: default echo function
include/linux/dma-mapping.h: add dma_zalloc_coherent()
sysctl: make CONFIG_SYSCTL_SYSCALL default to n
sysctl: add support for poll()
RapidIO: documentation update
drivers/net/rionet.c: fix ethernet address macros for LE platforms
RapidIO: fix potential null deref in rio_setup_device()
RapidIO: add mport driver for Tsi721 bridge
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 37 | ||||
-rw-r--r-- | mm/internal.h | 46 | ||||
-rw-r--r-- | mm/memcontrol.c | 1006 | ||||
-rw-r--r-- | mm/memory.c | 2 | ||||
-rw-r--r-- | mm/page_cgroup.c | 9 | ||||
-rw-r--r-- | mm/swap.c | 83 | ||||
-rw-r--r-- | mm/vmscan.c | 4 |
7 files changed, 641 insertions, 546 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ec211ddd..4298abaae15 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); diff --git a/mm/internal.h b/mm/internal.h index d071d380fb4..2189af49178 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d5755544af..7af1d5ee159 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -201,8 +201,8 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; -static void mem_cgroup_threshold(struct mem_cgroup *mem); -static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* * The memory controller data structure. The memory controller controls both @@ -362,29 +362,29 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) -static void mem_cgroup_get(struct mem_cgroup *mem); -static void mem_cgroup_put(struct mem_cgroup *mem); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(struct mem_cgroup *mem); +static void mem_cgroup_get(struct mem_cgroup *memcg); +static void mem_cgroup_put(struct mem_cgroup *memcg); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); +static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) { - return &mem->css; + return &memcg->css; } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) +page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(mem, nid, zid); + return mem_cgroup_zoneinfo(memcg, nid, zid); } static struct mem_cgroup_tree_per_zone * @@ -403,7 +403,7 @@ soft_limit_tree_from_page(struct page *page) } static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, unsigned long long new_usage_in_excess) @@ -437,7 +437,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, } static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { @@ -448,17 +448,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, } static void -mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); spin_unlock(&mctz->lock); } -static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long long excess; struct mem_cgroup_per_zone *mz; @@ -471,9 +471,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ - for (; mem; mem = parent_mem_cgroup(mem)) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - excess = res_counter_soft_limit_excess(&mem->res); + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -482,18 +482,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); spin_unlock(&mctz->lock); } } } -static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { int node, zone; struct mem_cgroup_per_zone *mz; @@ -501,9 +501,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) for_each_node_state(node, N_POSSIBLE) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(mem, node, zone); + mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(mem, mz, mctz); + mem_cgroup_remove_exceeded(memcg, mz, mctz); } } } @@ -564,7 +564,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static long mem_cgroup_read_stat(struct mem_cgroup *mem, +static long mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { long val = 0; @@ -572,81 +572,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, get_online_cpus(); for_each_online_cpu(cpu) - val += per_cpu(mem->stat->count[idx], cpu); + val += per_cpu(memcg->stat->count[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.count[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.count[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif put_online_cpus(); return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); } -void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); } -static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, +static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { unsigned long val = 0; int cpu; for_each_online_cpu(cpu) - val += per_cpu(mem->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.events[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.events[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif return val; } -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, +static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, bool file, int nr_pages) { preempt_disable(); if (file) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + nr_pages); /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; enum lru_list l; unsigned long ret = 0; - mz = mem_cgroup_zoneinfo(mem, nid, zid); + mz = mem_cgroup_zoneinfo(memcg, nid, zid); for_each_lru(l) { if (BIT(l) & lru_mask) @@ -656,44 +658,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, } static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { u64 total = 0; int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); + total += mem_cgroup_zone_nr_lru_pages(memcg, + nid, zid, lru_mask); return total; } -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { int nid; u64 total = 0; for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); + total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int target) +static bool __memcg_event_check(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - next = this_cpu_read(mem->stat->targets[target]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ return ((long)next - (long)val < 0); } -static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); switch (target) { case MEM_CGROUP_TARGET_THRESH: @@ -709,34 +712,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) return; } - this_cpu_write(mem->stat->targets[target], next); + __this_cpu_write(memcg->stat->targets[target], next); } /* * Check events in order. * */ -static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { + preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { - mem_cgroup_threshold(mem); - __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + mem_cgroup_threshold(memcg); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_SOFTLIMIT))) { - mem_cgroup_update_tree(mem, page); - __mem_cgroup_target_update(mem, + mem_cgroup_update_tree(memcg, page); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_NUMAINFO))) { - atomic_inc(&mem->numainfo_events); - __mem_cgroup_target_update(mem, + atomic_inc(&memcg->numainfo_events); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_NUMAINFO); } #endif } + preempt_enable(); } static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -762,7 +767,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; if (!mm) return NULL; @@ -773,25 +778,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) */ rcu_read_lock(); do { - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) break; - } while (!css_tryget(&mem->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); - return mem; + return memcg; } /* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; int found; - if (!mem) /* ROOT cgroup has the smallest ID */ + if (!memcg) /* ROOT cgroup has the smallest ID */ return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!mem->use_hierarchy) { - if (css_tryget(&mem->css)) - return mem; + if (!memcg->use_hierarchy) { + if (css_tryget(&memcg->css)) + return memcg; return NULL; } rcu_read_lock(); @@ -799,13 +804,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) * searching a memory cgroup which has the smallest ID under given * ROOT cgroup. (ID >= 1) */ - css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + memcg = container_of(css, struct mem_cgroup, css); else - mem = NULL; + memcg = NULL; rcu_read_unlock(); - return mem; + return memcg; } static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, @@ -859,29 +864,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, for_each_mem_cgroup_tree_cond(iter, NULL, true) -static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { - return (mem == root_mem_cgroup); + return (memcg == root_mem_cgroup); } void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; if (!mm) return; rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) goto out; switch (idx) { case PGMAJFAULT: - mem_cgroup_pgmajfault(mem, 1); + mem_cgroup_pgmajfault(memcg, 1); break; case PGFAULT: - mem_cgroup_pgfault(mem, 1); + mem_cgroup_pgfault(memcg, 1); break; default: BUG(); @@ -990,6 +995,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); if (!PageCgroupUsed(pc)) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ @@ -1041,7 +1056,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); - + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); /* taking care of that the page is added to LRU while we commit it */ if (likely(!PageLRU(page))) return; @@ -1063,21 +1087,21 @@ void mem_cgroup_move_lists(struct page *page, } /* - * Checks whether given mem is same or in the root_mem's + * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, - struct mem_cgroup *mem) +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) { - if (root_mem != mem) { - return (root_mem->use_hierarchy && - css_is_ancestor(&mem->css, &root_mem->css)); + if (root_memcg != memcg) { + return (root_memcg->use_hierarchy && + css_is_ancestor(&memcg->css, &root_memcg->css)); } return true; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) { int ret; struct mem_cgroup *curr = NULL; @@ -1091,25 +1115,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) if (!curr) return 0; /* - * We should check use_hierarchy of "mem" not "curr". Because checking + * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "mem" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "mem"). + * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "memcg"). */ - ret = mem_cgroup_same_or_subtree(mem, curr); + ret = mem_cgroup_same_or_subtree(memcg, curr); css_put(&curr->css); return ret; } -static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { - unsigned long active; + unsigned long inactive_ratio; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); unsigned long inactive; + unsigned long active; unsigned long gb; - unsigned long inactive_ratio; - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1117,39 +1145,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ else inactive_ratio = 1; - if (present_pages) { - present_pages[0] = inactive; - present_pages[1] = active; - } - - return inactive_ratio; + return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) -{ - unsigned long active; - unsigned long inactive; - unsigned long present_pages[2]; - unsigned long inactive_ratio; - - inactive_ratio = calc_inactive_ratio(memcg, present_pages); - - inactive = present_pages[0]; - active = present_pages[1]; - - if (inactive * inactive_ratio < active) - return 1; - - return 0; -} - -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) { unsigned long active; unsigned long inactive; + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_FILE)); return (active > inactive); } @@ -1254,13 +1263,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * Returns the maximum amount of memory @mem can be charged with, in * pages. */ -static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) +static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { unsigned long long margin; - margin = res_counter_margin(&mem->res); + margin = res_counter_margin(&memcg->res); if (do_swap_account) - margin = min(margin, res_counter_margin(&mem->memsw)); + margin = min(margin, res_counter_margin(&memcg->memsw)); return margin >> PAGE_SHIFT; } @@ -1275,33 +1284,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } -static void mem_cgroup_start_move(struct mem_cgroup *mem) +static void mem_cgroup_start_move(struct mem_cgroup *memcg) { int cpu; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); synchronize_rcu(); } -static void mem_cgroup_end_move(struct mem_cgroup *mem) +static void mem_cgroup_end_move(struct mem_cgroup *memcg) { int cpu; - if (!mem) + if (!memcg) return; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); } /* @@ -1316,13 +1325,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem) * waiting at hith-memory prressure caused by "move". */ -static bool mem_cgroup_stealed(struct mem_cgroup *mem) +static bool mem_cgroup_stealed(struct mem_cgroup *memcg) { VM_BUG_ON(!rcu_read_lock_held()); - return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; + return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; } -static bool mem_cgroup_under_move(struct mem_cgroup *mem) +static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; struct mem_cgroup *to; @@ -1337,17 +1346,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(mem, from) - || mem_cgroup_same_or_subtree(mem, to); + ret = mem_cgroup_same_or_subtree(memcg, from) + || mem_cgroup_same_or_subtree(memcg, to); unlock: spin_unlock(&mc.lock); return ret; } -static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) { if (mc.moving_task && current != mc.moving_task) { - if (mem_cgroup_under_move(mem)) { + if (mem_cgroup_under_move(memcg)) { DEFINE_WAIT(wait); prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); /* moving charge context might have finished. */ @@ -1431,12 +1440,12 @@ done: * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. */ -static int mem_cgroup_count_children(struct mem_cgroup *mem) +static int mem_cgroup_count_children(struct mem_cgroup *memcg) { int num = 0; struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) num++; return num; } @@ -1466,21 +1475,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_mem) +mem_cgroup_select_victim(struct mem_cgroup *root_memcg) { struct mem_cgroup *ret = NULL; struct cgroup_subsys_state *css; int nextid, found; - if (!root_mem->use_hierarchy) { - css_get(&root_mem->css); - ret = root_mem; + if (!root_memcg->use_hierarchy) { + css_get(&root_memcg->css); + ret = root_memcg; } while (!ret) { rcu_read_lock(); - nextid = root_mem->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + nextid = root_memcg->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, &found); if (css && css_tryget(css)) ret = container_of(css, struct mem_cgroup, css); @@ -1489,9 +1498,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) /* Updates scanning parameter */ if (!css) { /* this means start scan from ID:1 */ - root_mem->last_scanned_child = 0; + root_memcg->last_scanned_child = 0; } else - root_mem->last_scanned_child = found; + root_memcg->last_scanned_child = found; } return ret; @@ -1507,14 +1516,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * reclaimable pages on a node. Returns true if there are any reclaimable * pages in the node. */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) return true; return false; @@ -1527,29 +1536,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, * nodes based on the zonelist. So update the list loosely once per 10 secs. * */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) { int nid; /* * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET * pagein/pageout changes since the last update. */ - if (!atomic_read(&mem->numainfo_events)) + if (!atomic_read(&memcg->numainfo_events)) return; - if (atomic_inc_return(&mem->numainfo_updating) > 1) + if (atomic_inc_return(&memcg->numainfo_updating) > 1) return; /* make a nodemask where this memcg uses memory from */ - mem->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_HIGH_MEMORY]; for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { - if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) - node_clear(nid, mem->scan_nodes); + if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) + node_clear(nid, memcg->scan_nodes); } - atomic_set(&mem->numainfo_events, 0); - atomic_set(&mem->numainfo_updating, 0); + atomic_set(&memcg->numainfo_events, 0); + atomic_set(&memcg->numainfo_updating, 0); } /* @@ -1564,16 +1573,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) * * Now, we use round-robin. Better algorithm is welcomed. */ -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { int node; - mem_cgroup_may_update_nodemask(mem); - node = mem->last_scanned_node; + mem_cgroup_may_upda |