Diffstat (limited to 'mm/page_cgroup.c')
-rw-r--r--	mm/page_cgroup.c	341
1 file changed, 183 insertions(+), 158 deletions(-)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde..3708264d283 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,14 +11,6 @@
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
 
-static void __meminit
-__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
-{
-	pc->flags = 0;
-	pc->mem_cgroup = NULL;
-	pc->page = pfn_to_page(pfn);
-	INIT_LIST_HEAD(&pc->lru);
-}
 static unsigned long total_usage;
 
 #if !defined(CONFIG_SPARSEMEM)
@@ -36,35 +28,37 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 	struct page_cgroup *base;
 
 	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * The sanity checks the page allocator does upon freeing a
+	 * page can reach here before the page_cgroup arrays are
+	 * allocated when feeding a range of pages to the allocator
+	 * for the first time during bootup or memory hotplug.
+	 */
 	if (unlikely(!base))
 		return NULL;
-
+#endif
 	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
 	return base + offset;
 }
 
 static int __init alloc_node_page_cgroup(int nid)
 {
-	struct page_cgroup *base, *pc;
+	struct page_cgroup *base;
 	unsigned long table_size;
-	unsigned long start_pfn, nr_pages, index;
+	unsigned long nr_pages;
 
-	start_pfn = NODE_DATA(nid)->node_start_pfn;
 	nr_pages = NODE_DATA(nid)->node_spanned_pages;
-
 	if (!nr_pages)
 		return 0;
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
-	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	base = memblock_virt_alloc_try_nid_nopanic(
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+			BOOTMEM_ALLOC_ACCESSIBLE, nid);
 	if (!base)
 		return -ENOMEM;
-	for (index = 0; index < nr_pages; index++) {
-		pc = base + index;
-		__init_page_cgroup(pc, start_pfn + index);
-	}
 	NODE_DATA(nid)->node_page_cgroup = base;
 	total_usage += table_size;
 	return 0;
@@ -99,69 +93,89 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	struct mem_section *section = __pfn_to_section(pfn);
-
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * The sanity checks the page allocator does upon freeing a
+	 * page can reach here before the page_cgroup arrays are
+	 * allocated when feeding a range of pages to the allocator
+	 * for the first time during bootup or memory hotplug.
+	 */
 	if (!section->page_cgroup)
 		return NULL;
+#endif
 	return section->page_cgroup + pfn;
 }
 
-/* __alloc_bootmem...() is protected by !slab_available() */
-static int __init_refok init_section_page_cgroup(unsigned long pfn)
+static void *__meminit alloc_page_cgroup(size_t size, int nid)
 {
-	struct mem_section *section = __pfn_to_section(pfn);
-	struct page_cgroup *base, *pc;
-	unsigned long table_size;
-	int nid, index;
-
-	if (!section->page_cgroup) {
-		nid = page_to_nid(pfn_to_page(pfn));
-		table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-		VM_BUG_ON(!slab_is_available());
-		if (node_state(nid, N_HIGH_MEMORY)) {
-			base = kmalloc_node(table_size,
-					GFP_KERNEL | __GFP_NOWARN, nid);
-			if (!base)
-				base = vmalloc_node(table_size, nid);
-		} else {
-			base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
-			if (!base)
-				base = vmalloc(table_size);
-		}
-		/*
-		 * The value stored in section->page_cgroup is (base - pfn)
-		 * and it does not point to the memory block allocated above,
-		 * causing kmemleak false positives.
-		 */
-		kmemleak_not_leak(base);
-	} else {
-		/*
-		 * We don't have to allocate page_cgroup again, but
-		 * address of memmap may be changed. So, we have to initialize
-		 * again.
-		 */
-		base = section->page_cgroup + pfn;
-		table_size = 0;
-		/* check address of memmap is changed or not. */
-		if (base->page == pfn_to_page(pfn))
-			return 0;
+	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+	void *addr = NULL;
+
+	addr = alloc_pages_exact_nid(nid, size, flags);
+	if (addr) {
+		kmemleak_alloc(addr, size, 1, flags);
+		return addr;
 	}
+
+	if (node_state(nid, N_HIGH_MEMORY))
+		addr = vzalloc_node(size, nid);
+	else
+		addr = vzalloc(size);
+
+	return addr;
+}
+
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
+{
+	struct mem_section *section;
+	struct page_cgroup *base;
+	unsigned long table_size;
+
+	section = __pfn_to_section(pfn);
+
+	if (section->page_cgroup)
+		return 0;
+
+	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+	base = alloc_page_cgroup(table_size, nid);
+
+	/*
+	 * The value stored in section->page_cgroup is (base - pfn)
+	 * and it does not point to the memory block allocated above,
+	 * causing kmemleak false positives.
+	 */
+	kmemleak_not_leak(base);
 
 	if (!base) {
 		printk(KERN_ERR "page cgroup allocation failure\n");
 		return -ENOMEM;
 	}
 
-	for (index = 0; index < PAGES_PER_SECTION; index++) {
-		pc = base + index;
-		__init_page_cgroup(pc, pfn + index);
-	}
-
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-void __free_page_cgroup(unsigned long pfn)
+static void free_page_cgroup(void *addr)
+{
+	if (is_vmalloc_addr(addr)) {
+		vfree(addr);
+	} else {
+		struct page *page = virt_to_page(addr);
+		size_t table_size =
+			sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+
+		BUG_ON(PageReserved(page));
+		free_pages_exact(addr, table_size);
+	}
+}
+
+static void __free_page_cgroup(unsigned long pfn)
 {
 	struct mem_section *ms;
 	struct page_cgroup *base;
@@ -170,32 +184,34 @@ void __free_page_cgroup(unsigned long pfn)
 	if (!ms || !ms->page_cgroup)
 		return;
 	base = ms->page_cgroup + pfn;
-	if (is_vmalloc_addr(base)) {
-		vfree(base);
-		ms->page_cgroup = NULL;
-	} else {
-		struct page *page = virt_to_page(base);
-		if (!PageReserved(page)) { /* Is bootmem ? */
-			kfree(base);
-			ms->page_cgroup = NULL;
-		}
-	}
+	free_page_cgroup(base);
+	ms->page_cgroup = NULL;
 }
 
-int __meminit online_page_cgroup(unsigned long start_pfn,
-			unsigned long nr_pages,
-			int nid)
+static int __meminit online_page_cgroup(unsigned long start_pfn,
+				unsigned long nr_pages,
+				int nid)
 {
 	unsigned long start, end, pfn;
 	int fail = 0;
 
-	start = start_pfn & ~(PAGES_PER_SECTION - 1);
-	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+	start = SECTION_ALIGN_DOWN(start_pfn);
+	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
 
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -207,13 +223,13 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	return -ENOMEM;
 }
 
-int __meminit offline_page_cgroup(unsigned long start_pfn,
-		unsigned long nr_pages, int nid)
+static int __meminit offline_page_cgroup(unsigned long start_pfn,
+				unsigned long nr_pages, int nid)
 {
 	unsigned long start, end, pfn;
 
-	start = start_pfn & ~(PAGES_PER_SECTION - 1);
-	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+	start = SECTION_ALIGN_DOWN(start_pfn);
+	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
 		__free_page_cgroup(pfn);
@@ -236,6 +252,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 				mn->nr_pages, mn->status_change_nid);
 		break;
 	case MEM_CANCEL_ONLINE:
+		offline_page_cgroup(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
 	case MEM_GOING_OFFLINE:
 		break;
 	case MEM_ONLINE:
@@ -243,12 +262,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 		break;
 	}
 
-	if (ret)
-		ret = notifier_from_errno(ret);
-	else
-		ret = NOTIFY_OK;
-
-	return ret;
+	return notifier_from_errno(ret);
 }
 
 #endif
@@ -256,25 +270,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized. So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn;
+		     pfn < end_pfn;
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_cgroup(pfn, nid))
+				goto oom;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+			 "don't want memory cgroups\n");
+	return;
+oom:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -285,7 +321,7 @@
 
 #endif
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
 
 static DEFINE_MUTEX(swap_cgroup_mutex);
 struct swap_cgroup_ctrl {
@@ -294,13 +330,12 @@ struct swap_cgroup_ctrl {
 	spinlock_t lock;
 };
 
-struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 
 struct swap_cgroup {
 	unsigned short	id;
 };
 #define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
-#define SC_POS_MASK	(SC_PER_PAGE - 1)
 
 /*
  * SwapCgroup implements "lookup" and "exchange" operations.
@@ -342,33 +377,42 @@ not_enough_page:
 	return -ENOMEM;
 }
 
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+					struct swap_cgroup_ctrl **ctrlp)
+{
+	pgoff_t offset = swp_offset(ent);
+	struct swap_cgroup_ctrl *ctrl;
+	struct page *mappage;
+	struct swap_cgroup *sc;
+
+	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+	if (ctrlp)
+		*ctrlp = ctrl;
+
+	mappage = ctrl->map[offset / SC_PER_PAGE];
+	sc = page_address(mappage);
+	return sc + offset % SC_PER_PAGE;
+}
+
 /**
  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
  * @old: old id
  * @new: new id
  *
  * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup useing 0 as its id)
+ * (There is no mem_cgroup using 0 as its id)
  */
 unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 					unsigned short old, unsigned short new)
 {
-	int type = swp_type(ent);
-	unsigned long offset = swp_offset(ent);
-	unsigned long idx = offset / SC_PER_PAGE;
-	unsigned long pos = offset & SC_POS_MASK;
 	struct swap_cgroup_ctrl *ctrl;
-	struct page *mappage;
 	struct swap_cgroup *sc;
 	unsigned long flags;
 	unsigned short retval;
 
-	ctrl = &swap_cgroup_ctrl[type];
+	sc = lookup_swap_cgroup(ent, &ctrl);
 
-	mappage = ctrl->map[idx];
-	sc = page_address(mappage);
-	sc += pos;
 	spin_lock_irqsave(&ctrl->lock, flags);
 	retval = sc->id;
 	if (retval == old)
@@ -382,28 +426,20 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
 /**
  * swap_cgroup_record - record mem_cgroup for this swp_entry.
  * @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
  *
  * Returns old value at success, 0 at failure.
 * (Of course, old value can be 0.)
 */
 unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 {
-	int type = swp_type(ent);
-	unsigned long offset = swp_offset(ent);
-	unsigned long idx = offset / SC_PER_PAGE;
-	unsigned long pos = offset & SC_POS_MASK;
 	struct swap_cgroup_ctrl *ctrl;
-	struct page *mappage;
 	struct swap_cgroup *sc;
 	unsigned short old;
 	unsigned long flags;
 
-	ctrl = &swap_cgroup_ctrl[type];
+	sc = lookup_swap_cgroup(ent, &ctrl);
 
-	mappage = ctrl->map[idx];
-	sc = page_address(mappage);
-	sc += pos;
 	spin_lock_irqsave(&ctrl->lock, flags);
 	old = sc->id;
 	sc->id = id;
@@ -413,28 +449,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 }
 
 /**
- * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
 * @ent: swap entry to be looked up.
 *
- * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 */
-unsigned short lookup_swap_cgroup(swp_entry_t ent)
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
-	int type = swp_type(ent);
-	unsigned long offset = swp_offset(ent);
-	unsigned long idx = offset / SC_PER_PAGE;
-	unsigned long pos = offset & SC_POS_MASK;
-	struct swap_cgroup_ctrl *ctrl;
-	struct page *mappage;
-	struct swap_cgroup *sc;
-	unsigned short ret;
-
-	ctrl = &swap_cgroup_ctrl[type];
-	mappage = ctrl->map[idx];
-	sc = page_address(mappage);
-	sc += pos;
-	ret = sc->id;
-	return ret;
+	return lookup_swap_cgroup(ent, NULL)->id;
 }
 
 int swap_cgroup_swapon(int type, unsigned long max_pages)
@@ -447,14 +469,13 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
 	if (!do_swap_account)
 		return 0;
 
-	length = ((max_pages/SC_PER_PAGE) + 1);
+	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
 	array_size = length * sizeof(void *);
 
-	array = vmalloc(array_size);
+	array = vzalloc(array_size);
 	if (!array)
 		goto nomem;
 
-	memset(array, 0, array_size);
 	ctrl = &swap_cgroup_ctrl[type];
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl->length = length;
@@ -464,8 +485,8 @@
 		/* memory shortage */
 		ctrl->map = NULL;
 		ctrl->length = 0;
-		vfree(array);
 		mutex_unlock(&swap_cgroup_mutex);
+		vfree(array);
 		goto nomem;
 	}
 	mutex_unlock(&swap_cgroup_mutex);
@@ -474,13 +495,14 @@
 nomem:
 	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
 	printk(KERN_INFO
-	"swap_cgroup can be disabled by noswapaccount boot option\n");
+		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
 	return -ENOMEM;
 }
 
 void swap_cgroup_swapoff(int type)
 {
-	int i;
+	struct page **map;
+	unsigned long i, length;
 	struct swap_cgroup_ctrl *ctrl;
 
 	if (!do_swap_account)
@@ -488,17 +510,20 @@ void swap_cgroup_swapoff(int type)
 
 	mutex_lock(&swap_cgroup_mutex);
 	ctrl = &swap_cgroup_ctrl[type];
-	if (ctrl->map) {
-		for (i = 0; i < ctrl->length; i++) {
-			struct page *page = ctrl->map[i];
+	map = ctrl->map;
+	length = ctrl->length;
+	ctrl->map = NULL;
+	ctrl->length = 0;
+	mutex_unlock(&swap_cgroup_mutex);
+
+	if (map) {
+		for (i = 0; i < length; i++) {
+			struct page *page = map[i];
 			if (page)
 				__free_page(page);
 		}
-		vfree(ctrl->map);
-		ctrl->map = NULL;
-		ctrl->length = 0;
+		vfree(map);
 	}
-	mutex_unlock(&swap_cgroup_mutex);
 }
 
 #endif
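A few notes on the less obvious changes above. SECTION_ALIGN_DOWN() and SECTION_ALIGN_UP() replace the open-coded start_pfn & ~(PAGES_PER_SECTION - 1) and ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION) in online_page_cgroup() and offline_page_cgroup(). The sketch below shows the equivalence; the macro bodies mirror the helpers in include/linux/mmzone.h, while the section size is a made-up value for illustration (SPARSEMEM keeps PAGES_PER_SECTION a power of two, which is what makes the mask form valid):

#include <stdio.h>

/* Hypothetical section size for illustration only. */
#define PAGES_PER_SECTION	(1UL << 15)
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

/* Sketches of the include/linux/mmzone.h helpers the patch switches to. */
#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)

int main(void)
{
	unsigned long start_pfn = 40000, nr_pages = 1000;

	/* Matches the removed expression: round down to the section start... */
	printf("start: %lu == %lu\n",
	       SECTION_ALIGN_DOWN(start_pfn),
	       start_pfn & ~(PAGES_PER_SECTION - 1));
	/* ...and round the end up to the next section boundary. */
	printf("end:   %lu\n", SECTION_ALIGN_UP(start_pfn + nr_pages));
	return 0;
}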
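The new lookup_swap_cgroup() helper deduplicates the two-level map arithmetic that swap_cgroup_cmpxchg(), swap_cgroup_record() and the old lookup function each open-coded: a swap offset selects one backing page of ctrl->map (offset / SC_PER_PAGE) and a slot within that page (offset % SC_PER_PAGE, which the removed SC_POS_MASK used to compute by masking). A minimal user-space sketch of the same arithmetic, assuming a hypothetical 4096-byte page:

#include <stdio.h>

/* Mirrors the kernel layout: one 2-byte id per swap slot. */
struct swap_cgroup {
	unsigned short id;
};

#define PAGE_SIZE	4096UL
#define SC_PER_PAGE	(PAGE_SIZE / sizeof(struct swap_cgroup))	/* 2048 slots */

int main(void)
{
	unsigned long offset = 5000;	/* hypothetical swap offset */

	/* Which page of ctrl->map holds the slot, and where in that page. */
	unsigned long page_idx = offset / SC_PER_PAGE;	/* 5000 / 2048 = 2   */
	unsigned long slot     = offset % SC_PER_PAGE;	/* 5000 % 2048 = 904 */

	printf("ctrl->map[%lu], entry %lu\n", page_idx, slot);
	return 0;
}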
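Three smaller cleanups are easy to miss. The hotplug notifier now returns notifier_from_errno(ret) unconditionally, which is equivalent to the removed if/else because notifier_from_errno(0) is NOTIFY_OK. In swap_cgroup_swapon(), DIV_ROUND_UP(max_pages, SC_PER_PAGE) rounds up only when needed, so an exact multiple no longer allocates a spare map page: with a 4096-byte page (SC_PER_PAGE == 2048), max_pages == 4096 now yields 2 map pages where the old ((max_pages/SC_PER_PAGE) + 1) allocated 3. And swap_cgroup_swapoff() now detaches ctrl->map and ctrl->length while holding swap_cgroup_mutex but frees the pages only after dropping it, keeping the potentially long __free_page() loop and the final vfree() out of the locked region.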
