diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 17 | ||||
-rw-r--r-- | mm/filemap.c | 12 | ||||
-rw-r--r-- | mm/filemap_xip.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 88 | ||||
-rw-r--r-- | mm/memory.c | 47 | ||||
-rw-r--r-- | mm/mmap.c | 7 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 9 | ||||
-rw-r--r-- | mm/page_alloc.c | 15 | ||||
-rw-r--r-- | mm/quicklist.c | 12 | ||||
-rw-r--r-- | mm/slab.c | 68 | ||||
-rw-r--r-- | mm/slob.c | 2 | ||||
-rw-r--r-- | mm/slub.c | 139 | ||||
-rw-r--r-- | mm/sparse.c | 20 |
14 files changed, 336 insertions, 111 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c070ec0c15b..0016ebd4dcb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -112,18 +112,17 @@ config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC -# -# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page -# and page_to_pfn. The most efficient option where kernel virtual space is -# not under pressure. -# config SPARSEMEM_VMEMMAP_ENABLE def_bool n config SPARSEMEM_VMEMMAP - bool - depends on SPARSEMEM - default y if (SPARSEMEM_VMEMMAP_ENABLE) + bool "Sparse Memory virtual memmap" + depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE + default y + help + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG @@ -188,7 +187,7 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if (SUPERH && !SUPERH64) + default "2" if SUPERH default "1" config VIRT_TO_BUS diff --git a/mm/filemap.c b/mm/filemap.c index 455119cc7f4..89ce6fe5f8b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,6 +124,18 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. 
+ */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } } void remove_from_page_cache(struct page *page) diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index e233fff61b4..f874ae818ad 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -25,14 +25,15 @@ static struct page *__xip_sparse_page; static struct page *xip_sparse_page(void) { if (!__xip_sparse_page) { - unsigned long zeroes = get_zeroed_page(GFP_HIGHUSER); - if (zeroes) { + struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); + + if (page) { static DEFINE_SPINLOCK(xip_alloc_lock); spin_lock(&xip_alloc_lock); if (!__xip_sparse_page) - __xip_sparse_page = virt_to_page(zeroes); + __xip_sparse_page = page; else - free_page(zeroes); + __free_page(page); spin_unlock(&xip_alloc_lock); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6121b57bbe9..db861d8b6c2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,7 +31,7 @@ static unsigned int free_huge_pages_node[MAX_NUMNODES]; static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -int hugetlb_dynamic_pool; +unsigned long nr_overcommit_huge_pages; static int hugetlb_next_nid; /* @@ -227,22 +227,58 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, unsigned long address) { struct page *page; + unsigned int nid; - /* Check if the dynamic pool is enabled */ - if (!hugetlb_dynamic_pool) + /* + * Assume we will successfully allocate the surplus page to + * prevent racing processes from causing the surplus to exceed + * overcommit + * + * This however introduces a different race, where a process B + * tries to grow the static hugepage pool while alloc_pages() is + * called by process A. 
B will only examine the per-node + * counters in determining if surplus huge pages can be + * converted to normal huge pages in adjust_pool_surplus(). A + * won't be able to increment the per-node counter, until the + * lock is dropped by B, but B doesn't drop hugetlb_lock until + * no more huge pages can be converted from surplus to normal + * state (and doesn't try to convert again). Thus, we have a + * case where a surplus huge page exists, the pool is grown, and + * the surplus huge page still exists after, even though it + * should just have been converted to a normal huge page. This + * does not leak memory, though, as the hugepage will be freed + * once it is out of use. It also does not allow the counters to + * go out of whack in adjust_pool_surplus() as we don't modify + * the node values until we've gotten the hugepage and only the + * per-node value is checked there. + */ + spin_lock(&hugetlb_lock); + if (surplus_huge_pages >= nr_overcommit_huge_pages) { + spin_unlock(&hugetlb_lock); return NULL; + } else { + nr_huge_pages++; + surplus_huge_pages++; + } + spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, HUGETLB_PAGE_ORDER); + + spin_lock(&hugetlb_lock); if (page) { + nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); - spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[page_to_nid(page)]++; - surplus_huge_pages++; - surplus_huge_pages_node[page_to_nid(page)]++; - spin_unlock(&hugetlb_lock); + /* + * We incremented the global counters already + */ + nr_huge_pages_node[nid]++; + surplus_huge_pages_node[nid]++; + } else { + nr_huge_pages--; + surplus_huge_pages--; } + spin_unlock(&hugetlb_lock); return page; } @@ -382,9 +418,14 @@ static struct page *alloc_huge_page_private(struct vm_area_struct *vma, if (free_huge_pages > resv_huge_pages) page = dequeue_huge_page(vma, addr); spin_unlock(&hugetlb_lock); - if (!page) + if (!page) { page = alloc_buddy_huge_page(vma, addr); - return 
page ? page : ERR_PTR(-VM_FAULT_OOM); + if (!page) { + hugetlb_put_quota(vma->vm_file->f_mapping, 1); + return ERR_PTR(-VM_FAULT_OOM); + } + } + return page; } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -481,6 +522,12 @@ static unsigned long set_max_huge_pages(unsigned long count) * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. + * + * We might race with alloc_buddy_huge_page() here and be unable + * to convert a surplus huge page to a normal huge page. That is + * not critical, though, it just means the overall size of the + * pool might be one hugepage larger than it needs to be, but + * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); while (surplus_huge_pages && count > persistent_huge_pages) { @@ -509,6 +556,14 @@ static unsigned long set_max_huge_pages(unsigned long count) * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. + * + * By placing pages into the surplus state independent of the + * overcommit value, we are allowing the surplus pool size to + * exceed overcommit. There are few sane options here. Since + * alloc_buddy_huge_page() is checking the global counter, + * though, we'll note that we're not allowed to exceed surplus + * and won't grow the pool anywhere else. Not until one of the + * sysctls are changed, or the surplus pages go out of use. 
*/ min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; min_count = max(count, min_count); @@ -644,6 +699,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; + + /* If the pagetables are shared don't copy or take references */ + if (dst_pte == src_pte) + continue; + spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if (!pte_none(*src_pte)) { @@ -907,7 +967,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - if (!pte || pte_none(*pte)) { + if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { int ret; spin_unlock(&mm->page_table_lock); @@ -1156,8 +1216,10 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; ret = hugetlb_acct_memory(chg); - if (ret < 0) + if (ret < 0) { + hugetlb_put_quota(inode->i_mapping, chg); return ret; + } region_add(&inode->i_mapping->private_list, from, to); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 4bf0b6d0eb2..d902d0e25ed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -392,6 +392,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ return NULL; } +#ifdef CONFIG_DEBUG_VM /* * Add some anal sanity checks for now. Eventually, * we should just do "return pfn_to_page(pfn)", but @@ -402,6 +403,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ print_bad_pte(vma, pte, addr); return NULL; } +#endif /* * NOTE! 
We still have PageReserved() pages in the page @@ -511,8 +513,7 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(src_ptl) || - need_lockbreak(dst_ptl)) + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -851,7 +852,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; @@ -1668,6 +1669,9 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty @@ -1763,8 +1767,7 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - need_break = need_resched() || - need_lockbreak(details->i_mmap_lock); + need_break = need_resched() || spin_needbreak(details->i_mmap_lock); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ @@ -2341,6 +2344,9 @@ out_unlocked: if (anon) page_cache_release(vmf.page); else if (dirty_page) { + if (vma->vm_file) + file_update_time(vma->vm_file); + set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } @@ -2748,3 +2754,34 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in return buf - old_buf; } + +/* + * Print the name of a VMA. 
+ */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) { + char *p, *s; + + p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + s = strrchr(p, '/'); + if (s) + p = s+1; + printk("%s%s[%lx+%lx]", prefix, p, + vma->vm_start, + vma->vm_end - vma->vm_start); + free_page((unsigned long)buf); + } + } + up_read(&current->mm->mmap_sem); +} diff --git a/mm/mmap.c b/mm/mmap.c index 15678aa6ec7..d2b6d44962b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -251,7 +251,8 @@ asmlinkage unsigned long sys_brk(unsigned long brk) * not page aligned -Ram Gupta */ rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + + (mm->end_data - mm->start_data) > rlim) goto out; newbrk = PAGE_ALIGN(brk); @@ -1620,7 +1621,7 @@ static inline int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; address &= PAGE_MASK; - error = security_file_mmap(0, 0, 0, 0, address, 1); + error = security_file_mmap(NULL, 0, 0, 0, address, 1); if (error) return error; @@ -1941,7 +1942,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (is_hugepage_only_range(mm, addr, len)) return -EINVAL; - error = security_file_mmap(0, 0, 0, 0, addr, 1); + error = security_file_mmap(NULL, 0, 0, 0, addr, 1); if (error) return error; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 91a081a82f5..96473b48209 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose) * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... 
*/ - p->time_slice = HZ; + p->rt.time_slice = HZ; set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d55cfcae2ef..3d3848fa632 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -558,7 +558,6 @@ static void background_writeout(unsigned long _min_pages) global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; - wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; @@ -566,9 +565,8 @@ static void background_writeout(unsigned long _min_pages) min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else + congestion_wait(WRITE, HZ/10); + if (!wbc.encountered_congestion) break; } } @@ -633,12 +631,11 @@ static void wb_kupdate(unsigned long arg) global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { - wbc.more_io = 0; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) + if (wbc.encountered_congestion) congestion_wait(WRITE, HZ/10); else break; /* All the old data is written */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a58d476c1..b2838c24e58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -847,8 +847,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) break; + + /* + * Split buddy pages returned by expand() are received here + * in physical page order. The page is added to the callers and + * list and the list head then moves forward. From the callers + * perspective, the linked list is ordered by page number in + * some conditions. 
This is useful for IO devices that can + * merge IO requests if the physical pages are ordered + * properly. + */ list_add(&page->lru, list); set_page_private(page, migratetype); + list = &page->lru; } spin_unlock(&zone->lock); return i; @@ -2555,7 +2566,7 @@ static void __meminit zone_init_free_lists(struct pglist_data *pgdat, memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __devinit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { int batch; @@ -3427,7 +3438,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) mem_map = NODE_DATA(0)->node_mem_map; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= pgdat->node_start_pfn; + mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ } #endif diff --git a/mm/quicklist.c b/mm/quicklist.c index ae8189c2799..3f703f7cb39 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,9 +26,17 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + + node_free_pages = +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + +#endif + zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); - node_free_pages = node_page_state(numa_node_id(), - NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; return max(max, min_pages); } diff --git a/mm/slab.c b/mm/slab.c index 2e338a5f7b1..40c00dacbe4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -304,11 +304,11 @@ struct kmem_list3 { /* * Need this for bootstrapping a per node allocator. 
*/ -#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) +#define NUM_INIT_LISTS (3 * MAX_NUMNODES) struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 -#define SIZE_AC 1 -#define SIZE_L3 (1 + MAX_NUMNODES) +#define SIZE_AC MAX_NUMNODES +#define SIZE_L3 (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); @@ -730,8 +730,7 @@ static inline void init_lock_keys(void) #endif /* - * 1. Guard access to the cache-chain. - * 2. Protect sanity of cpu_online_map against cpu hotplug events + * Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, int err = 0; switch (action) { - case CPU_LOCK_ACQUIRE: - mutex_lock(&cache_chain_mutex); - break; case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: + mutex_lock(&cache_chain_mutex); err = cpuup_prepare(cpu); + mutex_unlock(&cache_chain_mutex); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, #endif case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: + mutex_lock(&cache_chain_mutex); cpuup_canceled(cpu); - break; - case CPU_LOCK_RELEASE: mutex_unlock(&cache_chain_mutex); break; } @@ -1410,6 +1407,22 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, } /* + * For setting up all the kmem_list3s for cache whose buffer_size is same as + * size of kmem_list3. + */ +static void __init set_up_list3s(struct kmem_cache *cachep, int index) +{ + int node; + + for_each_online_node(node) { + cachep->nodelists[node] = &initkmem_list3[index + node]; + cachep->nodelists[node]->next_reap = jiffies + + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + } +} + +/* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). 
*/ @@ -1432,6 +1445,7 @@ void __init kmem_cache_init(void) if (i < MAX_NUMNODES) cache_cache.nodelists[i] = NULL; } + set_up_list3s(&cache_cache, CACHE_CACHE); /* * Fragmentation resistance on low memory - only use bigger @@ -1587,10 +1601,9 @@ void __init kmem_cache_init(void) { int nid; - /* Replace the static kmem_list3 structures for the boot cpu */ - init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); + for_each_online_node(nid) { + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], nid); - for_each_node_state(nid, N_NORMAL_MEMORY) { init_list(malloc_sizes[INDEX_AC].cs_cachep, &initkmem_list3[SIZE_AC + nid], nid); @@ -1960,22 +1973,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) } } -/* - * For setting up all the kmem_list3s for cache whose buffer_size is same as - * size of kmem_list3. - */ -static void __init set_up_list3s(struct kmem_cache *cachep, int index) -{ - int node; - - for_each_node_state(node, N_NORMAL_MEMORY) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + - REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - } -} - static void __kmem_cache_destroy(struct kmem_cache *cachep) { int i; @@ -2099,7 +2096,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) g_cpucache_up = PARTIAL_L3; } else { int node; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_online_node(node) { cachep->nodelists[node] = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); @@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, * We use cache_chain_mutex to ensure a consistent view of * cpu_online_map as well. 
Please see cpuup_callback */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); list_for_each_entry(pc, &cache_chain, next) { @@ -2396,6 +2394,7 @@ oops: panic("kmem_cache_create(): failed to create slab `%s'\n", name); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return cachep; } EXPORT_SYMBOL(kmem_cache_create); @@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep) int ret; BUG_ON(!cachep || in_interrupt()); + get_online_cpus(); mutex_lock(&cache_chain_mutex); ret = __cache_shrink(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) BUG_ON(!cachep || in_interrupt()); /* Find the cache in the chain of caches. */ + get_online_cpus(); mutex_lock(&cache_chain_mutex); /* * the chain is never empty, cache_cache is never destroyed @@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) slab_error(cachep, "Can't free all objects"); list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); return; } @@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); + put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3815,7 +3819,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) struct array_cache *new_shared; struct array_cache **new_alien = NULL; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_online_node(node) { if (use_alien_caches) { new_alien = alloc_alien_cache(node, cachep->limit); @@ -4105,7 +4109,7 @@ out: schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_SLABINFO static void print_slabinfo_header(struct seq_file *m) { diff --git a/mm/slob.c b/mm/slob.c index ee2ef8af0d4..773a7aa80ab 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int 
align, int node) /* Not enough space: must allocate a new page */ if (!b) { - b = slob_new_page(gfp, 0, node); + b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); if (!b) return 0; sp = (struct slob_page *)virt_to_page(b); diff --git a/mm/slub.c b/mm/slub.c index b9f37cb0f2e..5cc4b7dddb5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -172,7 +172,7 @@ static inline void ClearSlabDebug(struct page *page) * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ -#define MIN_PARTIAL 2 +#define MIN_PARTIAL 5 /* * Maximum number of desirable partial slabs. @@ -1613,7 +1613,7 @@ checks_ok: * then add it. */ if (unlikely(!prior)) - add_partial(get_node(s, page_to_nid(page)), page); + add_partial_tail(get_node(s, page_to_nid(page)), page); out_unlock: slab_unlock(page); @@ -3076,6 +3076,19 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return slab_alloc(s, gfpflags, node, caller); } +static unsigned long count_partial(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += page->inuse; + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -3458,19 +3471,6 @@ static int list_locations(struct kmem_cache *s, char *buf, return n; } -static unsigned long count_partial(struct kmem_cache_node *n) -{ - unsigned long flags; - unsigned long x = 0; - struct page *page; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += page->inuse; - spin_unlock_irqrestore(&n->list_lock, flags); - return x; -} - enum slab_stat_type { SL_FULL, SL_PARTIAL, @@ -3962,7 +3962,7 @@ static struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; -static 
decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +static struct kset *slab_kset; #define ID_STR_LENGTH 64 @@ -4015,7 +4015,7 @@ static int sysfs_slab_add(struct kmem_cache *s) * This is typically the case for debug situations. In that * case we can catch duplicate names easily. */ - sysfs_remove_link(&slab_subsys.kobj, s->name); + sysfs_remove_link(&slab_kset->kobj, s->name); name = s->name; } else { /* @@ -4025,12 +4025,12 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - kobj_set_kset_s(s, slab_subsys); - kobject_set_name(&s->kobj, name); - kobject_init(&s->kobj); - err = kobject_add(&s->kobj); - if (err) + s->kobj.kset = slab_kset; + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); + if (err) { + kobject_put(&s->kobj); return err; + } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -4070,9 +4070,8 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) /* * If we have a leftover link then remove it. */ - sysfs_remove_link(&slab_subsys.kobj, name); - return sysfs_create_link(&slab_subsys.kobj, - &s->kobj, name); + sysfs_remove_link(&slab_kset->kobj, name); + return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); @@ -4091,8 +4090,8 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; - err = subsystem_register(&slab_subsys); - if (err) { + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + if (!slab_kset) { printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -4123,3 +4122,89 @@ static int __init slab_sysfs_init(void) __initcall(slab_sysfs_init); #endif + +/* + * The /proc/slabinfo ABI + */ +#ifdef CONFIG_SLABINFO + +ssize_t slabinfo_write(struct file *file, const char __user * buffer, + size_t count, loff_t *ppos) +{ + return -EINVAL; +} + + +static void print_slabinfo_header(struct seq_file *m) +{ + seq_puts(m, "slabinfo - version: 2.1\n"); + seq_puts(m, "# 
name <active_objs> <num_objs> <objsize> " + "<objperslab> <pagesperslab>"); + seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); + seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + + down_read(&slub_lock); + if (!n) + print_slabinfo_header(m); + + return seq_list_start(&slab_caches, *pos); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &slab_caches, pos); +} + +static void s_stop(struct seq_file *m, void *p) +{ + up_read(&slub_lock); +} + +static int s_show(struct seq_file *m, void *p) +{ + unsigned long nr_partials = 0; + unsigned long nr_slabs = 0; + unsigned long nr_inuse = 0; + unsigned long nr_objs; + struct kmem_cache *s; + int node; + + s = list_entry(p, struct kmem_cache, list); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (!n) + continue; + + nr_partials += n->nr_partial; + nr_slabs += atomic_long_read(&n->nr_slabs); + nr_inuse += count_partial(n); + } + + nr_objs = nr_slabs * s->objects; + nr_inuse += (nr_slabs - nr_partials) * s->objects; + + seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse, + nr_objs, s->size, s->objects, (1 << s->order)); + seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0); + seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, + 0UL); + seq_putc(m, '\n'); + return 0; +} + +const struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +#endif /* CONFIG_SLABINFO */ diff --git a/mm/sparse.c b/mm/sparse.c index e06f514fe04..a2183cb5d52 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -83,6 +83,8 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) return -EEXIST; section = sparse_index_alloc(nid); + if (!section) + return -ENOMEM; /* * This lock keeps two different sections from * reallocating for the same 
index @@ -389,9 +391,17 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, * no locking for this, because it does its own * plus, it does a kmalloc */ - sparse_index_init(section_nr, pgdat->node_id); + ret = sparse_index_init(section_nr, pgdat->node_id); + if (ret < 0 && ret != -EEXIST) + return ret; memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + if (!memmap) + return -ENOMEM; usemap = __kmalloc_section_usemap(); + if (!usemap) { + __kfree_section_memmap(memmap, nr_pages); + return -ENOMEM; + } pgdat_resize_lock(pgdat, &flags); @@ -401,18 +411,16 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } - if (!usemap) { - ret = -ENOMEM; - goto out; - } ms->section_mem_map |= SECTION_MARKED_PRESENT; ret = sparse_init_one_section(ms, section_nr, memmap, usemap); out: pgdat_resize_unlock(pgdat, &flags); - if (ret <= 0) + if (ret <= 0) { + kfree(usemap); __kfree_section_memmap(memmap, nr_pages); + } return ret; } #endif |