Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig            |    5
-rw-r--r--   mm/Makefile           |    3
-rw-r--r--   mm/filemap.c          |   85
-rw-r--r--   mm/internal.h         |    2
-rw-r--r--   mm/madvise.c          |   33
-rw-r--r--   mm/memory.c           |  106
-rw-r--r--   mm/mmap.c             |   43
-rw-r--r--   mm/oom_kill.c         |   17
-rw-r--r--   mm/page-writeback.c   |   50
-rw-r--r--   mm/page_alloc.c       |   50
-rw-r--r--   mm/quicklist.c        |   88
-rw-r--r--   mm/readahead.c        |   29
-rw-r--r--   mm/rmap.c             |    3
-rw-r--r--   mm/shmem.c            |    3
-rw-r--r--   mm/slab.c             |  193
-rw-r--r--   mm/slob.c             |   57
-rw-r--r--   mm/slub.c             | 3520
-rw-r--r--   mm/swap.c             |    2
-rw-r--r--   mm/swapfile.c         |    3
-rw-r--r--   mm/vmscan.c           |   13
20 files changed, 4070 insertions(+), 235 deletions(-)
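Among other changes, the patch below adds krealloc() to both mm/slab.c and mm/slob.c. As a minimal, illustrative sketch that is not part of the commit, the hypothetical helper grow_buffer() below shows the semantics described by the new kerneldoc: contents are preserved up to the lesser of the old and new sizes, a NULL pointer behaves like kmalloc(), a new_size of 0 frees the object, and a failed allocation leaves the old object untouched. The helper name and the use of GFP_KERNEL are this example's own choices.

#include <linux/slab.h>

/*
 * Illustrative only -- not part of the patch below. Resize a buffer
 * with the krealloc() this commit introduces.
 */
static void *grow_buffer(void *buf, size_t new_size)
{
	void *nbuf = krealloc(buf, new_size, GFP_KERNEL);

	/*
	 * NULL means either new_size was 0 (buf has been freed by
	 * krealloc) or the allocation failed (buf is untouched and
	 * still usable by the caller).
	 */
	if (!nbuf && new_size)
		return buf;
	return nbuf;
}

Callers that previously open-coded kmalloc() plus memcpy() plus kfree() can switch to this pattern once the patch is applied.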
diff --git a/mm/Kconfig b/mm/Kconfig index 7942b333e46..1ac718f636e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -163,3 +163,8 @@ config ZONE_DMA_FLAG default "0" if !ZONE_DMA default "1" +config NR_QUICK + int + depends on QUICKLIST + default "1" + diff --git a/mm/Makefile b/mm/Makefile index f3c077eb0b8..a9148ea329a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -25,7 +25,10 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o +obj-$(CONFIG_QUICKLIST) += quicklist.o + diff --git a/mm/filemap.c b/mm/filemap.c index 5dfc093ceb3..5631d6b2a62 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -868,6 +868,7 @@ void do_generic_mapping_read(struct address_space *mapping, unsigned long last_index; unsigned long next_index; unsigned long prev_index; + unsigned int prev_offset; loff_t isize; struct page *cached_page; int error; @@ -876,7 +877,8 @@ void do_generic_mapping_read(struct address_space *mapping, cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; - prev_index = ra.prev_page; + prev_index = ra.prev_index; + prev_offset = ra.prev_offset; last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; @@ -924,10 +926,10 @@ page_ok: flush_dcache_page(page); /* - * When (part of) the same page is read multiple times - * in succession, only mark it as accessed the first time. + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. */ - if (prev_index != index) + if (prev_index != index || offset != prev_offset) mark_page_accessed(page); prev_index = index; @@ -945,6 +947,8 @@ page_ok: offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + ra.prev_offset = offset; page_cache_release(page); if (ret == nr && desc->count) @@ -1446,30 +1450,6 @@ page_not_uptodate: majmin = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); } - lock_page(page); - - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } /* * Umm, take care of errors if the page isn't up-to-date. @@ -1726,7 +1706,7 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); -static inline struct page *__read_cache_page(struct address_space *mapping, +static struct page *__read_cache_page(struct address_space *mapping, unsigned long index, int (*filler)(void *,struct page*), void *data) @@ -1763,17 +1743,11 @@ repeat: return page; } -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Read into the page cache. If a page already exists, - * and PageUptodate() is not set, try to fill the page. 
+/* + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. */ -struct page *read_cache_page(struct address_space *mapping, +struct page *read_cache_page_async(struct address_space *mapping, unsigned long index, int (*filler)(void *,struct page*), void *data) @@ -1805,6 +1779,39 @@ retry: page = ERR_PTR(err); } out: + mark_page_accessed(page); + return page; +} +EXPORT_SYMBOL(read_cache_page_async); + +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page then wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + + page = read_cache_page_async(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + out: return page; } EXPORT_SYMBOL(read_cache_page); diff --git a/mm/internal.h b/mm/internal.h index d527b80b292..a3110c02aea 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + VM_BUG_ON(PageCompound(page) && PageTail(page)); VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/madvise.c b/mm/madvise.c index 603c5257ed6..e75096b5a6d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -12,6 +12,24 @@ #include <linux/hugetlb.h> /* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} + +/* * We can potentially split a vm area into separate * areas, each area with its own behavior. 
*/ @@ -183,9 +201,9 @@ static long madvise_remove(struct vm_area_struct *vma, + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ - up_write(&current->mm->mmap_sem); + up_read(&current->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); - down_write(&current->mm->mmap_sem); + down_read(&current->mm->mmap_sem); return error; } @@ -270,7 +288,10 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) int error = -EINVAL; size_t len; - down_write(&current->mm->mmap_sem); + if (madvise_need_mmap_write(behavior)) + down_write(&current->mm->mmap_sem); + else + down_read(&current->mm->mmap_sem); if (start & ~PAGE_MASK) goto out; @@ -332,6 +353,10 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) vma = find_vma(current->mm, start); } out: - up_write(&current->mm->mmap_sem); + if (madvise_need_mmap_write(behavior)) + up_write(&current->mm->mmap_sem); + else + up_read(&current->mm->mmap_sem); + return error; } diff --git a/mm/memory.c b/mm/memory.c index e7066e71dfa..1d647ab0ee7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1448,6 +1448,100 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pmd_page; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + pmd_page = pmd_page(*pmd); + + do { + err = fn(pte, pmd_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically.
Before making any commitment, on @@ -2539,12 +2633,6 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2573,12 +2661,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } -#else -/* Workaround for gcc 2.96 */ -int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - return 0; -} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) diff --git a/mm/mmap.c b/mm/mmap.c index 88da687bde8..52646d61ff6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1200,6 +1200,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (len > TASK_SIZE) return -ENOMEM; + if (flags & MAP_FIXED) + return addr; + if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); @@ -1273,6 +1276,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (len > TASK_SIZE) return -ENOMEM; + if (flags & MAP_FIXED) + return addr; + /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); @@ -1361,38 +1367,21 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long ret; - - if (!(flags & MAP_FIXED)) { - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - addr = get_area(file, addr, len, pgoff, flags); - if (IS_ERR_VALUE(addr)) - return addr; - } + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (addr & ~PAGE_MASK) return -EINVAL; - if (file && is_file_hugepages(file)) { - /* - * Check if the given range is hugepage aligned, and - * can be made suitable for hugepages. - */ - ret = prepare_hugepage_range(addr, len, pgoff); - } else { - /* - * Ensure that a normal request is not falling in a - * reserved hugepage range. For some archs like IA-64, - * there is a separate region for hugepages. - */ - ret = is_hugepage_only_range(current->mm, addr, len); - } - if (ret) - return -EINVAL; + return addr; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3791edfffee..a7001410ab1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -147,9 +147,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Adjust the score by oomkilladj. 
*/ if (p->oomkilladj) { - if (p->oomkilladj > 0) + if (p->oomkilladj > 0) { + if (!points) + points = 1; points <<= p->oomkilladj; - else + } else points >>= -(p->oomkilladj); } @@ -397,6 +399,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) struct task_struct *p; unsigned long points = 0; unsigned long freed = 0; + int constraint; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) @@ -411,14 +414,18 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) show_mem(); } - cpuset_lock(); - read_lock(&tasklist_lock); + if (sysctl_panic_on_oom == 2) + panic("out of memory. Compulsory panic_on_oom is selected.\n"); /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - switch (constrained_alloc(zonelist, gfp_mask)) { + constraint = constrained_alloc(zonelist, gfp_mask); + cpuset_lock(); + read_lock(&tasklist_lock); + + switch (constraint) { case CONSTRAINT_MEMORY_POLICY: oom_kill_process(current, points, "No available memory (MPOL_BIND)"); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a794945fd19..029dfad5a23 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -119,6 +119,44 @@ static void background_writeout(unsigned long _min_pages); * We make sure that the background writeout level is below the adjusted * clamping level. */ + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_online_node(node) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_page_state(z, NR_INACTIVE) + + zone_page_state(z, NR_ACTIVE); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + + global_page_state(NR_INACTIVE) + + global_page_state(NR_ACTIVE); + x -= highmem_dirtyable_memory(x); + return x + 1; /* Ensure that we never return 0 */ +} + static void get_dirty_limits(long *pbackground, long *pdirty, struct address_space *mapping) @@ -128,20 +166,12 @@ get_dirty_limits(long *pbackground, long *pdirty, int unmapped_ratio; long background; long dirty; - unsigned long available_memory = vm_total_pages; + unsigned long available_memory = determine_dirtyable_memory(); struct task_struct *tsk; -#ifdef CONFIG_HIGHMEM - /* - * We always exclude high memory from our count. 
- */ - available_memory -= totalhigh_pages; -#endif - - unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; + available_memory; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 353ce9039a8..59164313167 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -156,10 +156,8 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) static int page_is_consistent(struct zone *zone, struct page *page) { -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid_within(page_to_pfn(page))) return 0; -#endif if (zone != page_zone(page)) return 0; @@ -227,7 +225,7 @@ static void bad_page(struct page *page) static void free_compound_page(struct page *page) { - __free_pages_ok(page, (unsigned long)page[1].lru.prev); + __free_pages_ok(page, compound_order(page)); } static void prep_compound_page(struct page *page, unsigned long order) @@ -236,12 +234,13 @@ static void prep_compound_page(struct page *page, unsigned long order) int nr_pages = 1 << order; set_compound_page_dtor(page, free_compound_page); - page[1].lru.prev = (void *)order; - for (i = 0; i < nr_pages; i++) { + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageCompound(p); - set_page_private(p, (unsigned long)page); + __SetPageTail(p); + p->first_page = page; } } @@ -250,16 +249,19 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (unlikely((unsigned long)page[1].lru.prev != order)) + if (unlikely(compound_order(page) != order)) bad_page(page); - for (i = 0; i < nr_pages; i++) { + if (unlikely(!PageHead(page))) + bad_page(page); + __ClearPageHead(page); + for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - if (unlikely(!PageCompound(p) | - (page_private(p) != (unsigned long)page))) + if (unlikely(!PageTail(p) | + (p->first_page != page))) bad_page(page); - __ClearPageCompound(p); + __ClearPageTail(p); } } @@ -346,10 +348,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) static inline int page_is_buddy(struct page *page, struct page *buddy, int order) { -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(buddy))) + if (!pfn_valid_within(page_to_pfn(buddy))) return 0; -#endif if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -433,13 +433,18 @@ static inline int free_pages_check(struct page *page) 1 << PG_private | 1 << PG_locked | 1 << PG_active | - 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved | 1 << PG_buddy )))) bad_page(page); + /* + * PageReclaim == PageTail. It is only an error + * for PageReclaim to be set if PageCompound is clear. 
+ */ + if (unlikely(!PageCompound(page) && PageReclaim(page))) + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -665,7 +670,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } #if MAX_NUMNODES > 1 -int nr_node_ids __read_mostly; +int nr_node_ids __read_mostly = MAX_NUMNODES; EXPORT_SYMBOL(nr_node_ids); /* @@ -770,8 +775,8 @@ void mark_free_pages(struct zone *zone) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - if (!PageNosave(page)) - ClearPageNosaveFree(page); + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); } for (order = MAX_ORDER - 1; order >= 0; --order) @@ -780,7 +785,7 @@ void mark_free_pages(struct zone *zone) pfn = page_to_pfn(list_entry(curr, struct page, lru)); for (i = 0; i < (1UL << order); i++) - SetPageNosaveFree(pfn_to_page(pfn + i)); + swsusp_set_page_free(pfn_to_page(pfn + i)); } spin_unlock_irqrestore(&zone->lock, flags); @@ -3203,7 +3208,8 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); - setup_per_zone_pages_min(); + if (write) + setup_per_zone_pages_min(); return 0; } diff --git a/mm/quicklist.c b/mm/quicklist.c new file mode 100644 index 00000000000..ae8189c2799 --- /dev/null +++ b/mm/quicklist.c @@ -0,0 +1,88 @@ +/* + * Quicklist support. + * + * Quicklists are light weight lists of pages that have a defined state + * on alloc and free. Pages must be in the quicklist specific defined state + * (zero by default) when the page is freed. It seems that the initial idea + * for such lists first came from Dave Miller and then various other people + * improved on it. + * + * Copyright (C) 2007 SGI, + * Christoph Lameter <clameter@sgi.com> + * Generalized, added support for multiple lists and + * constructors / destructors. + */ +#include <linux/kernel.h> + +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/quicklist.h> + +DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; + +#define FRACTION_OF_NODE_MEM 16 + +static unsigned long max_pages(unsigned long min_pages) +{ + unsigned long node_free_pages, max; + + node_free_pages = node_page_state(numa_node_id(), + NR_FREE_PAGES); + max = node_free_pages / FRACTION_OF_NODE_MEM; + return max(max, min_pages); +} + +static long min_pages_to_free(struct quicklist *q, + unsigned long min_pages, long max_free) +{ + long pages_to_free; + + pages_to_free = q->nr_pages - max_pages(min_pages); + + return min(pages_to_free, max_free); +} + +/* + * Trim down the number of pages in the quicklist + */ +void quicklist_trim(int nr, void (*dtor)(void *), + unsigned long min_pages, unsigned long max_free) +{ + long pages_to_free; + struct quicklist *q; + + q = &get_cpu_var(quicklist)[nr]; + if (q->nr_pages > min_pages) { + pages_to_free = min_pages_to_free(q, min_pages, max_free); + + while (pages_to_free > 0) { + /* + * We pass a gfp_t of 0 to quicklist_alloc here + * because we will never call into the page allocator. 
+ */ + void *p = quicklist_alloc(nr, 0, NULL); + + if (dtor) + dtor(p); + free_page((unsigned long)p); + pages_to_free--; + } + } + put_cpu_var(quicklist); +} + +unsigned long quicklist_total_size(void) +{ + unsigned long count = 0; + int cpu; + struct quicklist *ql, *q; + + for_each_online_cpu(cpu) { + ql = per_cpu(quicklist, cpu); + for (q = ql; q < ql + CONFIG_NR_QUICK; q++) + count += q->nr_pages; + } + return count; +} + diff --git a/mm/readahead.c b/mm/readahead.c index 93d9ee692fd..9861e883fe5 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -37,7 +37,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = mapping->backing_dev_info->ra_pages; - ra->prev_page = -1; + ra->prev_index = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -202,17 +202,19 @@ out: * size: Number of pages in that read * Together, these form the "current window". * Together, start and size represent the `readahead window'. - * prev_page: The page which the readahead algorithm most-recently inspected. + * prev_index: The page which the readahead algorithm most-recently inspected. * It is mainly used to detect sequential file reading. * If page_cache_readahead sees that it is again being called for * a page which it just looked at, it can return immediately without * making any state changes. + * offset: Offset in the prev_index where the last read ended - used for + * detection of sequential file reading. * ahead_start, * ahead_size: Together, these form the "ahead window". * ra_pages: The externally controlled max readahead for this fd. * * When readahead is in the off state (size == 0), readahead is disabled. - * In this state, prev_page is used to detect the resumption of sequential I/O. + * In this state, prev_index is used to detect the resumption of sequential I/O. * * The readahead code manages two windows - the "current" and the "ahead" * windows. The intent is that while the application is walking the pages @@ -415,7 +417,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, ra->ahead_size = get_next_ra_size(ra); ra->ahead_start = ra->start + ra->size; - block = force || (ra->prev_page >= ra->ahead_start); + block = force || (ra->prev_index >= ra->ahead_start); ret = blockable_page_cache_readahead(mapping, filp, ra->ahead_start, ra->ahead_size, ra, block); @@ -467,12 +469,13 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * We avoid doing extra work and bogusly perturbing the readahead * window expansion logic. */ - if (offset == ra->prev_page && --req_size) + if (offset == ra->prev_index && --req_size) ++offset; - /* Note that prev_page == -1 if it is a first read */ - sequential = (offset == ra->prev_page + 1); - ra->prev_page = offset; + /* Note that prev_index == -1 if it is a first read */ + sequential = (offset == ra->prev_index + 1); + ra->prev_index = offset; + ra->prev_offset = 0; max = get_max_readahead(ra); newsize = min(req_size, max); @@ -481,7 +484,7 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE)) goto out; - ra->prev_page += newsize - 1; + ra->prev_index += newsize - 1; /* * Special case - first read at start of file. We'll assume it's @@ -537,18 +540,18 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, * we get called back on the first page of the ahead window which * will allow us to submit more IO. 
*/ - if (ra->prev_page >= ra->ahead_start) { + if (ra->prev_index >= ra->ahead_start) { ra->start = ra->ahead_start; ra->size = ra->ahead_size; make_ahead_window(mapping, filp, ra, 0); recheck: - /* prev_page shouldn't overrun the ahead window */ - ra->prev_page = min(ra->prev_page, + /* prev_index shouldn't overrun the ahead window */ + ra->prev_index = min(ra->prev_index, ra->ahead_start + ra->ahead_size - 1); } out: - return ra->prev_page + 1; + return ra->prev_index + 1; } EXPORT_SYMBOL_GPL(page_cache_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 59da5b734c8..75a32be64a2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -162,8 +162,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) static void anon_vma_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) { - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { + if (flags & SLAB_CTOR_CONSTRUCTOR) { struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); diff --git a/mm/shmem.c b/mm/shmem.c index b2a35ebf071..f01e8deed64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2358,8 +2358,7 @@ static void init_once(void *foo, struct kmem_cache *cachep, { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { + if (flags & SLAB_CTOR_CONSTRUCTOR) { inode_init_once(&p->vfs_inode); #ifdef CONFIG_TMPFS_POSIX_ACL p->i_acl = NULL; diff --git a/mm/slab.c b/mm/slab.c index 168bfe9d8ff..5920a412b37 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -116,8 +116,7 @@ #include <asm/page.h> /* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, - * SLAB_RED_ZONE & SLAB_POISON. + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). * * STATS - 1 to collect stats for /proc/slabinfo. @@ -172,15 +171,15 @@ /* Legal flag mask for kmem_cache_create(). */ #if DEBUG -# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ +# define CREATE_MASK (SLAB_RED_ZONE | \ SLAB_POISON | SLAB_HWCACHE_ALIGN | \ SLAB_CACHE_DMA | \ - SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_STORE_USER | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #else # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ - SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) #endif @@ -389,7 +388,6 @@ struct kmem_cache { unsigned int buffer_size; u32 reciprocal_buffer_size; /* 3) touched by every alloc & free from the backend */ - struct kmem_list3 *nodelists[MAX_NUMNODES]; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -444,6 +442,17 @@ struct kmem_cache { int obj_offset; int obj_size; #endif + /* + * We put nodelists[] at the end of kmem_cache, because we want to size + * this array to nr_node_ids slots instead of MAX_NUMNODES + * (see kmem_cache_init()) + * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache + * is statically defined, so we reserve the max number of nodes. 
+ */ + struct kmem_list3 *nodelists[MAX_NUMNODES]; + /* + * Do not add fields after nodelists[] + */ }; #define CFLGS_OFF_SLAB (0x80000000UL) @@ -592,8 +601,7 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) static inline struct kmem_cache *page_get_cache(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); + page = compound_head(page); BUG_ON(!PageSlab(page)); return (struct kmem_cache *)page->lru.next; } @@ -605,21 +613,19 @@ static inline void page_set_slab(struct page *page, struct slab *slab) static inline struct slab *page_get_slab(struct page *page) { - if (unlikely(PageCompound(page))) - page = (struct page *)page_private(page); BUG_ON(!PageSlab(page)); return (struct slab *)page->lru.prev; } static inline struct kmem_cache *virt_to_cache(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_cache(page); } static inline struct slab *virt_to_slab(const void *obj) { - struct page *page = virt_to_page(obj); + struct page *page = virt_to_head_page(obj); return page_get_slab(page); } @@ -678,9 +684,6 @@ static struct kmem_cache cache_cache = { .shared = 1, .buffer_size = sizeof(struct kmem_cache), .name = "kmem_cache", -#if DEBUG - .obj_size = sizeof(struct kmem_cache), -#endif }; #define BAD_ALIEN_MAGIC 0x01020304ul @@ -1223,19 +1226,20 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, */ list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; - struct array_cache *shared; + struct array_cache *shared = NULL; struct array_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount); if (!nc) goto bad; - shared = alloc_arraycache(node, + if (cachep->shared) { + shared = alloc_arraycache(node, cachep->shared * cachep->batchcount, 0xbaadf00d); - if (!shared) - goto bad; - + if (!shared) + goto bad; + } if (use_alien_caches) { alien = alloc_alien_cache(node, cachep->limit); if (!alien) @@ -1317,8 +1321,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, shared = l3->shared; if (shared) { - free_block(cachep, l3->shared->entry, - l3->shared->avail, node); + free_block(cachep, shared->entry, + shared->avail, node); l3->shared = NULL; } @@ -1439,6 +1443,15 @@ void __init kmem_cache_init(void) cache_cache.array[smp_processor_id()] = &initarray_cache.cache; cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; + /* + * struct kmem_cache size depends on nr_node_ids, which + * can be less than MAX_NUMNODES. + */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +#if DEBUG + cache_cache.obj_size = cache_cache.buffer_size; +#endif cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); cache_cache.reciprocal_buffer_size = @@ -1932,7 +1945,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) * For setting up all the kmem_list3s for cache whose buffer_size is same as * size of kmem_list3. 
*/ -static void set_up_list3s(struct kmem_cache *cachep, int index) +static void __init set_up_list3s(struct kmem_cache *cachep, int index) { int node; @@ -2154,13 +2167,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ res = probe_kernel_address(pc->name, tmp); if (res) { - printk("SLAB: cache with size %d has lost its name\n", + printk(KERN_ERR + "SLAB: cache with size %d has lost its name\n", pc->buffer_size); continue; } if (!strcmp(pc->name, name)) { - printk("kmem_cache_create: duplicate cache %s\n", name); + printk(KERN_ERR + "kmem_cache_create: duplicate cache %s\n", name); dump_stack(); goto oops; } @@ -2168,12 +2183,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ - if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { - /* No constructor, but inital state check requested */ - printk(KERN_ERR "%s: No con, but init state check " - "requested - %s\n", __FUNCTION__, name); - flags &= ~SLAB_DEBUG_INITIAL; - } #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with @@ -2297,7 +2306,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { - printk("kmem_cache_create: couldn't create cache %s.\n", name); + printk(KERN_ERR + "kmem_cache_create: couldn't create cache %s.\n", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; @@ -2736,19 +2746,10 @@ static int cache_grow(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); - if (flags & __GFP_NO_GROW) - return 0; + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); ctor_flags = SLAB_CTOR_CONSTRUCTOR; local_flags = (flags & GFP_LEVEL_MASK); - if (!(local_flags & __GFP_WAIT)) - /* - * Not allowed to sleep. Need to tell a constructor about - * this - it might need to know... - */ - ctor_flags |= SLAB_CTOR_ATOMIC; - /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); l3 = cachep->nodelists[nodeid]; @@ -2861,7 +2862,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, objp -= obj_offset(cachep); kfree_debugcheck(objp); - page = virt_to_page(objp); + page = virt_to_head_page(objp); slabp = page_get_slab(page); @@ -2878,15 +2879,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); - if (cachep->flags & SLAB_DEBUG_INITIAL) { - /* - * Need to call the slab's constructor so the caller can - * perform a verify of its state (debugging). Called without - * the cache-lock held. - */ - cachep->ctor(objp + obj_offset(cachep), - cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); - } if (cachep->flags & SLAB_POISON && cachep->dtor) { /* we want to cache poison the object, * call the destruction callback @@ -2990,6 +2982,14 @@ retry: slabp = list_entry(entry, struct slab, list); check_slabp(cachep, slabp); check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. 
+ */ + BUG_ON(slabp->inuse < 0 || slabp->inuse >= cachep->num); + while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); @@ -3077,20 +3077,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, struct slab *slabp; unsigned objnr; - slabp = page_get_slab(virt_to_page(objp)); + slabp = page_get_slab(virt_to_head_page(objp)); objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; } #endif objp += obj_offset(cachep); - if (cachep->ctor && cachep->flags & SLAB_POISON) { - unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; - - if (!(flags & __GFP_WAIT)) - ctor_flags |= SLAB_CTOR_ATOMIC; - - cachep->ctor(objp, cachep, ctor_flags); - } + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", @@ -3145,7 +3139,7 @@ static int __init failslab_debugfs(void) struct dentry *dir; int err; - err = init_fault_attr_dentries(&failslab.attr, "failslab"); + err = init_fault_attr_dentries(&failslab.attr, "failslab"); if (err) return err; dir = failslab.attr.dentries.dir; @@ -3183,9 +3177,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); - if (should_failslab(cachep, flags)) - return NULL; - ac = cpu_cache_get(cachep); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); @@ -3259,7 +3250,7 @@ retry: flags | GFP_THISNODE, nid); } - if (!obj && !(flags & __GFP_NO_GROW)) { + if (!obj) { /* * This allocation will be performed within the constraints * of the current cpuset / memory policy requirements. @@ -3377,6 +3368,9 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long save_flags; void *ptr; + if (should_failslab(cachep, flags)) + return NULL; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3447,6 +3441,9 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) unsigned long save_flags; void *objp; + if (should_failslab(cachep, flags)) + return NULL; + cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); @@ -3740,6 +3737,53 @@ EXPORT_SYMBOL(__kmalloc); #endif /** + * krealloc - reallocate memory. The contents will remain unchanged. + * + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + struct kmem_cache *cache, *new_cache; + void *ret; + + if (unlikely(!p)) + return kmalloc_track_caller(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + cache = virt_to_cache(p); + new_cache = __find_general_cachep(new_size, flags); + + /* + * If new size fits in the current cache, bail out. + */ + if (likely(cache == new_cache)) + return (void *)p; + + /* + * We are on the slow-path here so do not use __cache_alloc + * because it bloats kernel text. 
+ */ + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ksize(p))); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. * @objp: The previously allocated object. @@ -3815,12 +3859,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep) goto fail; } - new_shared = alloc_arraycache(node, + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount, 0xbaadf00d); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } } l3 = cachep->nodelists[node]; @@ -3978,10 +4025,8 @@ static int enable_cpucache(struct kmem_cache *cachep) * to a larger limit. Thus disabled by default. */ shared = 0; -#ifdef CONFIG_SMP - if (cachep->buffer_size <= PAGE_SIZE) + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) shared = 8; -#endif #if DEBUG /* @@ -4481,7 +4526,7 @@ const struct seq_operations slabstats_op = { * allocated with either kmalloc() or kmem_cache_alloc(). The object * must not be freed during the duration of the call. */ -unsigned int ksize(const void *objp) +size_t ksize(const void *objp) { if (unlikely(objp == NULL)) return 0; diff --git a/mm/slob.c b/mm/slob.c index 5adc29cb58d..c6933bc19bc 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -21,7 +21,7 @@ * * SLAB is emulated on top of SLOB by simply calling constructors and * destructors for every SLAB allocation. Objects are returned with - * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is + * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is * set, in which case the low-level allocator will fragment blocks to * create the proper alignment. Again, objects of page-size or greater * are allocated by calling __get_free_pages. As SLAB objects know @@ -150,15 +150,6 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); } -static int FASTCALL(find_order(int size)); -static int fastcall find_order(int size) -{ - int order = 0; - for ( ; size > 4096 ; size >>=1) - order++; - return order; -} - void *__kmalloc(size_t size, gfp_t gfp) { slob_t *m; @@ -174,7 +165,7 @@ void *__kmalloc(size_t size, gfp_t gfp) if (!bb) return 0; - bb->order = find_order(size); + bb->order = get_order(size); bb->pages = (void *)__get_free_pages(gfp, bb->order); if (bb->pages) { @@ -190,6 +181,39 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. 
+ */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!p)) + return kmalloc_track_caller(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + ret = kmalloc_track_caller(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ksize(p))); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + void kfree(const void *block) { bigblock_t *bb, **last = &bigblocks; @@ -219,7 +243,7 @@ void kfree(const void *block) EXPORT_SYMBOL(kfree); -unsigned int ksize(const void *block) +size_t ksize(const void *block) { bigblock_t *bb; unsigned long flags; @@ -262,10 +286,11 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, c->ctor = ctor; c->dtor = dtor; /* ignore alignment unless it's forced */ - c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; + c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; if (c->align < align) c->align = align; - } + } else if (flags & SLAB_PANIC) + panic("Cannot create slab cache %s\n", name); return c; } @@ -284,7 +309,7 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) if (c->size < PAGE_SIZE) b = slob_alloc(c->size, flags, c->align); else - b = (void *)__get_free_pages(flags, find_order(c->size)); + b = (void *)__get_free_pages(flags, get_order(c->size)); if (c->ctor) c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); @@ -311,7 +336,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) if (c->size < PAGE_SIZE) slob_free(b, c->size); else - free_pages((unsigned long)b, find_order(c->size)); + free_pages((unsigned long)b, get_order(c->size)); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c new file mode 100644 index 00000000000..a6323484dd3 --- /dev/null +++ b/mm/slub.c @@ -0,0 +1,3520 @@ +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks and only + * uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/bit_spinlock.h> +#include <linux/interrupt.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/mempolicy.h> +#include <linux/ctype.h> +#include <linux/kallsyms.h> + +/* + * Lock order: + * 1. slab_lock(page) + * 2. slab->list_lock + * + * The slab_lock protects operations on the object of a particular + * slab and its metadata in the page struct. If the slab lock + * has been taken then no allocations nor frees can be performed + * on the objects in the slab nor can the slab be added or removed + * from the partial or full lists since this would mean modifying + * the page_struct of the slab. + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor make the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. 
+ * + * The lock order is sometimes inverted when we are trying to get a slab + * off a list. We take the list_lock and then look for a page on the list + * to use. While we do that objects in the slabs may be freed. We can + * only operate on the slab if we have also taken the slab_lock. So we use + * a slab_trylock() on the slab. If trylock was successful then no frees + * can occur anymore and we can use the slab for allocations etc. If the + * slab_trylock() does not succeed then frees are in progress in the slab and + * we must stay away from it for a while since we may cause a bouncing + * cacheline if we try to acquire the lock. So go onto the next slab. + * If all pages are busy then we may allocate a new slab instead of reusing + * a partial slab. A new slab has noone operating on it and thus there is + * no danger of cacheline contention. + * + * Interrupts are disabled during allocation and deallocation in order to + * make the slab allocator safe to use in the context of an irq. In addition + * interrupts are disabled to ensure that the processor does not change + * while handling per_cpu slabs, due to kernel preemption. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. + * + * Slabs with free elements are kept on a partial list. + * There is no list for full slabs. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * Otherwise there is no need to track full slabs unless we have to + * track full slabs for debugging purposes. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * Overloading of page flags that are otherwise used for LRU management. + * + * PageActive The slab is used as a cpu cache. Allocations + * may be performed from the slab. The slab is not + * on any slab list and cannot be moved onto one. + * + * PageError Slab requires special handling due to debug + * options set. This moves slab handling out of + * the fast path. + */ + +/* + * Issues still to be resolved: + * + * - The per cpu array is updated for each new slab and and is a remote + * cacheline for most nodes. This could become a bouncing cacheline given + * enough frequent updates. There are 16 pointers in a cacheline.so at + * max 16 cpus could compete. Likely okay. + * + * - Support PAGE_ALLOC_DEBUG. Should be easy to do. + * + * - Variable sizing of the per node arrays + */ + +/* Enable to test recovery from slab corruption on boot */ +#undef SLUB_RESILIENCY_TEST + +#if PAGE_SHIFT <= 12 + +/* + * Small page size. Make sure that we do not fragment memory + */ +#define DEFAULT_MAX_ORDER 1 +#define DEFAULT_MIN_OBJECTS 4 + +#else + +/* + * Large page machines are customarily able to handle larger + * page orders. + */ +#define DEFAULT_MAX_ORDER 2 +#define DEFAULT_MIN_OBJECTS 8 + +#endif + +/* + * Mininum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ +#define MIN_PARTIAL 2 + +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in the. 
+ */ +#define MAX_PARTIAL 10 + +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_STORE_USER) +/* + * Set of flags that will prevent slab merging + */ +#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU) + +#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) +#endif + +/* Internal SLUB flags */ +#define __OBJECT_POISON 0x80000000 /* Poison object */ + +static int kmem_size = sizeof(struct kmem_cache); + +#ifdef CONFIG_SMP +static struct notifier_block slab_notifier; +#endif + +static enum { + DOWN, /* No slab functionality available */ + PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + UP, /* Everything works */ + SYSFS /* Sysfs up */ +} slab_state = DOWN; + +/* A list of all slab caches on the system */ +static DECLARE_RWSEM(slub_lock); +LIST_HEAD(slab_caches); + +#ifdef CONFIG_SYSFS +static int sysfs_slab_add(struct kmem_cache *); +static int sysfs_slab_alias(struct kmem_cache *, const char *); +static void sysfs_slab_remove(struct kmem_cache *); +#else +static int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } +static void sysfs_slab_remove(struct kmem_cache *s) {} +#endif + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +int slab_is_available(void) +{ + return slab_state >= UP; +} + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ +#ifdef CONFIG_NUMA + return s->node[node]; +#else + return &s->local_node; +#endif +} + +/* + * Object debugging + */ +static void print_section(char *text, u8 *addr, unsigned int length) +{ + int i, offset; + int newline = 1; + char ascii[17]; + + ascii[16] = 0; + + for (i = 0; i < length; i++) { + if (newline) { + printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + newline = 0; + } + printk(" %02x", addr[i]); + offset = i % 16; + ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; + if (offset == 15) { + printk(" %s\n",ascii); + newline = 1; + } + } + if (!newline) { + i %= 16; + while (i < 16) { + printk(" "); + ascii[i] = ' '; + i++; + } + printk(" %s\n", ascii); + } +} + +/* + * Slow version of get and set free pointer. + * + * This requires touching the cache lines of kmem_cache. + * The offset can also be obtained from the page. In that + * case it is in the cacheline that we already need to touch. + */ +static void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* + * Tracking user of a slab. 
+ */ +struct track { + void *addr; /* Called from address */ + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; + +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + return p + alloc; +} + +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + p += alloc; + if (addr) { + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current ? current->pid : -1; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} + +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (s->flags & SLAB_STORE_USER) { + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); + } +} + +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + + printk(KERN_ERR "%s: ", s); + __print_symbol("%s", (unsigned long)t->addr); + printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); +} + +static void print_trailer(struct kmem_cache *s, u8 *p) +{ + unsigned int off; /* Offset of last byte */ + + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone", p + s->objsize, + s->inuse - s->objsize); + + printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", + p + s->offset, + get_freepointer(s, p)); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; + + if (s->flags & SLAB_STORE_USER) { + print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); + print_track("Last free ", get_track(s, p, TRACK_FREE)); + off += 2 * sizeof(struct track); + } + + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Filler", p + off, s->size - off); +} + +static void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + u8 *addr = page_address(page); + + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); + printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", + object - addr, page->flags, page->inuse, page->freelist); + if (object > addr + 16) + print_section("Bytes b4", object - 16, 16); + print_section("Object", object, min(s->objsize, 128)); + print_trailer(s, object); + dump_stack(); +} + +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +{ + va_list args; + char buf[100]; + + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} + +static void init_object(struct kmem_cache *s, void *object, int active) +{ + u8 *p = object; + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->objsize - 1); + p[s->objsize -1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->objsize, + active ? 
SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, + s->inuse - s->objsize); +} + +static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +{ + while (bytes) { + if (*start != (u8)value) + return 0; + start++; + bytes--; + } + return 1; +} + + +static int check_valid_pointer(struct kmem_cache *s, struct page *page, + void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + s->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->objsize + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended to word size if Redzoning is enabled + * and objsize == inuse. + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * + * If slabcaches are merged then the objsize and inuse boundaries are to + * be ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. + */ + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", + s->name, message, data, from, to - 1); + memset(from, data, to - from); +} + +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ + + if (s->offset) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->size == off) + return 1; + + if (check_bytes(p + off, POISON_INUSE, s->size - off)) + return 1; + + object_err(s, page, p, "Object padding check fails"); + + /* + * Restore padding + */ + restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); + return 0; +} + +static int slab_pad_check(struct kmem_cache *s, struct page *page) +{ + u8 *p; + int length, remainder; + + if (!(s->flags & SLAB_POISON)) + return 1; + + p = page_address(page); + length = s->objects * s->size; + remainder = (PAGE_SIZE << s->order) - length; + if (!remainder) + return 1; + + if (!check_bytes(p + length, POISON_INUSE, remainder)) { + slab_err(s, page, "Padding check failed"); + restore_bytes(s, "slab padding", POISON_INUSE, p + length, + p + length + remainder); + return 0; + } + return 1; +} + +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) +{ + u8 *p = object; + u8 *endobject = object + s->objsize; + + if (s->flags & SLAB_RED_ZONE) { + unsigned int red = + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + + if (!check_bytes(endobject, red, s->inuse - s->objsize)) { + object_err(s, page, object, + active ? 
"Redzone Active" : "Redzone Inactive"); + restore_bytes(s, "redzone", red, + endobject, object + s->inuse); + return 0; + } + } else { + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && + !check_bytes(endobject, POISON_INUSE, + s->inuse - s->objsize)) { + object_err(s, page, p, "Alignment padding check fails"); + /* + * Fix it so that there will not be another report. + * + * Hmmm... We may be corrupting an object that now expects + * to be longer than allowed. + */ + restore_bytes(s, "alignment padding", POISON_INUSE, + endobject, object + s->inuse); + } + } + + if (s->flags & SLAB_POISON) { + if (!active && (s->flags & __OBJECT_POISON) && + (!check_bytes(p, POISON_FREE, s->objsize - 1) || + p[s->objsize - 1] != POISON_END)) { + + object_err(s, page, p, "Poison check failed"); + restore_bytes(s, "Poison", POISON_FREE, + p, p + s->objsize -1); + restore_bytes(s, "Poison", POISON_END, + p + s->objsize - 1, p + s->objsize); + return 0; + } + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } + + if (!s->offset && active) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus loose the remainder + * of the free objects in this slab. May cause + * another error because the object count maybe + * wrong now. + */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct page *page) +{ + VM_BUG_ON(!irqs_disabled()); + + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page flags=%lx " + "mapping=0x%p count=%d", page->flags, page->mapping, + page_count(page)); + return 0; + } + if (page->offset * sizeof(void *) != s->offset) { + slab_err(s, page, "Corrupted offset %lu flags=0x%lx " + "mapping=0x%p count=%d", + (unsigned long)(page->offset * sizeof(void *)), + page->flags, + page->mapping, + page_count(page)); + return 0; + } + if (page->inuse > s->objects) { + slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " + "mapping=0x%p count=%d", + s->name, page->inuse, s->objects, page->flags, + page->mapping, page_count(page)); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist and + * therefore free. Must hold the slab lock for cpu slabs to + * guarantee that the chains are consistent. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void *fp = page->freelist; + void *object = NULL; + + while (fp && nr <= s->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + break; + } else { + slab_err(s, page, "Freepointer 0x%p corrupt", + fp); + page->freelist = NULL; + page->inuse = s->objects; + printk(KERN_ERR "@@@ SLUB %s: Freelist " + "cleared. Slab 0x%p\n", + s->name, page); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + if (page->inuse != s->objects - nr) { + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", s, page, page->inuse, + s->objects - nr); + page->inuse = s->objects - nr; + printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. 
" + "Slab @0x%p\n", s->name, page); + } + return search == NULL; +} + +/* + * Tracking of fully allocated slabs for debugging + */ +static void add_full(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + list_add(&page->lru, &n->full); + spin_unlock(&n->list_lock); +} + +static void remove_full(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n; + + if (!(s->flags & SLAB_STORE_USER)) + return; + + n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + spin_unlock(&n->list_lock); +} + +static int alloc_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto bad; + + if (object && !on_freelist(s, page, object)) { + slab_err(s, page, "Object 0x%p already allocated", object); + goto bad; + } + + if (!check_valid_pointer(s, page, object)) { + object_err(s, page, object, "Freelist Pointer check fails"); + goto bad; + } + + if (!object) + return 1; + + if (!check_object(s, page, object, 0)) + goto bad; + + return 1; +bad: + if (PageSlab(page)) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remainder. + */ + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", + s->name, page); + page->inuse = s->objects; + page->freelist = NULL; + /* Fix up fields that may be corrupted */ + page->offset = s->offset / sizeof(void *); + } + return 0; +} + +static int free_object_checks(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!check_slab(s, page)) + goto fail; + + if (!check_valid_pointer(s, page, object)) { + slab_err(s, page, "Invalid object pointer 0x%p", object); + goto fail; + } + + if (on_freelist(s, page, object)) { + slab_err(s, page, "Object 0x%p already free", object); + goto fail; + } + + if (!check_object(s, page, object, 1)) + return 0; + + if (unlikely(s != page->slab)) { + if (!PageSlab(page)) + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); + else + if (!page->slab) { + printk(KERN_ERR + "SLUB <none>: no slab for object 0x%p.\n", + object); + dump_stack(); + } + else + slab_err(s, page, "object at 0x%p belongs " + "to slab %s", object, page->slab->name); + goto fail; + } + return 1; +fail: + printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", + s->name, page, object); + return 0; +} + +/* + * Slab allocation and freeing + */ +static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page * page; + int pages = 1 << s->order; + + if (s->order) + flags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + flags |= SLUB_DMA; + + if (node == -1) + page = alloc_pages(flags, s->order); + else + page = alloc_pages_node(node, flags, s->order); + + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + pages); + + return page; +} + +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + if (PageError(page)) { + init_object(s, object, 0); + init_tracking(s, object); + } + + if (unlikely(s->ctor)) + s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); +} + +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + struct kmem_cache_node *n; + void *start; + void *end; + void *last; + void *p; + + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); + + if (flags & __GFP_WAIT) + local_irq_enable(); + + page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + if (!page) + goto out; + + n = get_node(s, page_to_nid(page)); + if (n) + atomic_long_inc(&n->nr_slabs); + page->offset = s->offset / sizeof(void *); + page->slab = s; + page->flags |= 1 << PG_slab; + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + page->flags |= 1 << PG_error; + + start = page_address(page); + end = start + s->objects * s->size; + + if (unlikely(s->flags & SLAB_POISON)) + memset(start, POISON_INUSE, PAGE_SIZE << s->order); + + last = start; + for (p = start + s->size; p < end; p += s->size) { + setup_object(s, page, last); + set_freepointer(s, last, p); + last = p; + } + setup_object(s, page, last); + set_freepointer(s, last, NULL); + + page->freelist = start; + page->inuse = 0; +out: + if (flags & __GFP_WAIT) + local_irq_disable(); + return page; +} + +static void __free_slab(struct kmem_cache *s, struct page *page) +{ + int pages = 1 << s->order; + + if (unlikely(PageError(page) || s->dtor)) { + void *start = page_address(page); + void *end = start + (pages << PAGE_SHIFT); + void *p; + + slab_pad_check(s, page); + for (p = start; p <= end - s->size; p += s->size) { + if (s->dtor) + s->dtor(p, s, 0); + check_object(s, page, p, 0); + } + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + - pages); + + page->mapping = NULL; + __free_pages(page, s->order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct page *page; + + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); +} + +static void free_slab(struct kmem_cache *s, struct page *page) +{ + if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + /* + * RCU free overloads the RCU head over the LRU + */ + struct rcu_head *head = (void *)&page->lru; + + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); +} + +static void discard_slab(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + atomic_long_dec(&n->nr_slabs); + reset_page_mapcount(page); + page->flags &= ~(1 << PG_slab | 1 << PG_error); + free_slab(s, page); +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + bit_spin_unlock(PG_locked, &page->flags); +} + +static __always_inline int slab_trylock(struct page *page) +{ + int rc = 1; + + rc = bit_spin_trylock(PG_locked, &page->flags); + return rc; +} + +/* + * Management of partially allocated slabs + */ +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void remove_partial(struct kmem_cache *s, + struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} + +/* + * Lock page and remove it from the partial list + * + * Must hold list_lock + */ +static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + return 1; + } + return 0; +} + +/* + * Try to get a partial slab from a specific node + */ +static struct page *get_partial_node(struct kmem_cache_node *n) +{ + struct page *page; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab then get_partials() will return NULL. + */ + if (!n || !n->nr_partial) + return NULL; + + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_del_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; +} + +/* + * Get a page from somewhere. Search in increasing NUMA + * distances. + */ +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zone **z; + struct page *page; + + /* + * The defrag ratio allows to configure the tradeoffs between + * inter node defragmentation and node local allocations. + * A lower defrag_ratio increases the tendency to do local + * allocations instead of scanning throught the partial + * lists on other nodes. + * + * If defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If its higher then kmalloc() + * may return off node objects in order to avoid fragmentation. 
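+	 *
+	 * As a rough illustration of the check below: kmem_cache_open()
+	 * initializes defrag_ratio to 100, so only about 100 out of 1024
+	 * (~10%) of the allocations that reach this point go on to scan
+	 * remote nodes for partial slabs.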
+ * + * A higher ratio means slabs may be taken from other nodes + * thus reducing the number of partial slabs on those nodes. + * + * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * defrag_ratio = 1000) then every (well almost) allocation + * will first attempt to defrag slab caches on other nodes. This + * means scanning over all nodes to look for partial slabs which + * may be a bit expensive to do on every slab allocation. + */ + if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) + return NULL; + + zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + for (z = zonelist->zones; *z; z++) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(*z)); + + if (n && cpuset_zone_allowed_hardwall(*z, flags) && + n->nr_partial > MIN_PARTIAL) { + page = get_partial_node(n); + if (page) + return page; + } + } +#endif + return NULL; +} + +/* + * Get a partial page, lock it and return it. + */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int searchnode = (node == -1) ? numa_node_id() : node; + + page = get_partial_node(get_node(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + return get_any_partial(s, flags); +} + +/* + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. + */ +static void putback_slab(struct kmem_cache *s, struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + if (page->inuse) { + + if (page->freelist) + add_partial(n, page); + else if (PageError(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); + slab_unlock(page); + + } else { + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty page to the partial slabs in order + * to avoid page allocator overhead. This page needs to + * come after all the others that are not fully empty + * in order to make sure that we do maximum + * defragmentation. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } + } +} + +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + s->cpu_slab[cpu] = NULL; + ClearPageActive(page); + + putback_slab(s, page); +} + +static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +{ + slab_lock(page); + deactivate_slab(s, page, cpu); +} + +/* + * Flush cpu slab. + * Called from IPI handler with interrupts disabled. + */ +static void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct page *page = s->cpu_slab[cpu]; + + if (likely(page)) + flush_slab(s, page, cpu); +} + +static void flush_cpu_slab(void *d) +{ + struct kmem_cache *s = d; + int cpu = smp_processor_id(); + + __flush_cpu_slab(s, cpu); +} + +static void flush_all(struct kmem_cache *s) +{ +#ifdef CONFIG_SMP + on_each_cpu(flush_cpu_slab, s, 1, 1); +#else + unsigned long flags; + + local_irq_save(flags); + flush_cpu_slab(s); + local_irq_restore(flags); +#endif +} + +/* + * slab_alloc is optimized to only modify two cachelines on the fast path + * (aside from the stack): + * + * 1. The page struct + * 2. The first cacheline of the object to be allocated. + * + * The only cache lines that are read (apart from code) is the + * per cpu array in the kmem_cache struct. 
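+ *
+ * Interrupts are disabled for the whole operation and the slab lock
+ * (PG_locked in page->flags) is taken before the freelist is touched.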
+ * + * Fastpath is not possible if we need to get a new slab or have + * debugging enabled (which means all slabs are marked with PageError) + */ +static void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) +{ + struct page *page; + void **object; + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + page = s->cpu_slab[cpu]; + if (!page) + goto new_slab; + + slab_lock(page); + if (unlikely(node != -1 && page_to_nid(page) != node)) + goto another_slab; +redo: + object = page->freelist; + if (unlikely(!object)) + goto another_slab; + if (unlikely(PageError(page))) + goto debug; + +have_object: + page->inuse++; + page->freelist = object[page->offset]; + slab_unlock(page); + local_irq_restore(flags); + return object; + +another_slab: + deactivate_slab(s, page, cpu); + +new_slab: + page = get_partial(s, gfpflags, node); + if (likely(page)) { +have_slab: + s->cpu_slab[cpu] = page; + SetPageActive(page); + goto redo; + } + + page = new_slab(s, gfpflags, node); + if (page) { + cpu = smp_processor_id(); + if (s->cpu_slab[cpu]) { + /* + * Someone else populated the cpu_slab while we enabled + * interrupts, or we have got scheduled on another cpu. + * The page may not be on the requested node. + */ + if (node == -1 || + page_to_nid(s->cpu_slab[cpu]) == node) { + /* + * Current cpuslab is acceptable and we + * want the current one since its cache hot + */ + discard_slab(s, page); + page = s->cpu_slab[cpu]; + slab_lock(page); + goto redo; + } + /* Dump the current slab */ + flush_slab(s, s->cpu_slab[cpu], cpu); + } + slab_lock(page); + goto have_slab; + } + local_irq_restore(flags); + return NULL; +debug: + if (!alloc_object_checks(s, page, object)) + goto another_slab; + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_ALLOC, addr); + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", + s->name, object, page->inuse, + page->freelist); + dump_stack(); + } + init_object(s, object, 1); + goto have_object; +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif + +/* + * The fastpath only writes the cacheline of the page struct and the first + * cacheline of the object. + * + * No special cachelines need to be read + */ +static void slab_free(struct kmem_cache *s, struct page *page, + void *x, void *addr) +{ + void *prior; + void **object = (void *)x; + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + + if (unlikely(PageError(page))) + goto debug; +checks_ok: + prior = object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; + + if (unlikely(PageActive(page))) + /* + * Cpu slabs are never on partial lists and are + * never freed. + */ + goto out_unlock; + + if (unlikely(!page->inuse)) + goto slab_empty; + + /* + * Objects left in the slab. If it + * was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) + add_partial(get_node(s, page_to_nid(page)), page); + +out_unlock: + slab_unlock(page); + local_irq_restore(flags); + return; + +slab_empty: + if (prior) + /* + * Slab on the partial list. 
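+		 * Take it off that list before the now empty slab is
+		 * discarded below.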
+ */ + remove_partial(s, page); + + slab_unlock(page); + discard_slab(s, page); + local_irq_restore(flags); + return; + +debug: + if (!free_object_checks(s, page, x)) + goto out_unlock; + if (!PageActive(page) && !page->freelist) + remove_full(s, page); + if (s->flags & SLAB_STORE_USER) + set_track(s, x, TRACK_FREE, addr); + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", + s->name, object, page->inuse, + page->freelist); + print_section("Object", (void *)object, s->objsize); + dump_stack(); + } + init_object(s, object, 0); + goto checks_ok; +} + +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + struct page *page; + + page = virt_to_head_page(x); + + slab_free(s, page, x, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_free); + +/* Figure out on which slab object the object resides */ +static struct page *get_object_page(const void *x) +{ + struct page *page = virt_to_head_page(x); + + if (!PageSlab(page)) + return NULL; + + return page; +} + +/* + * kmem_cache_open produces objects aligned at "size" and the first object + * is placed at offset 0 in the slab (We have no metainformation on the + * slab, all slabs are in essence "off slab"). + * + * In order to get the desired alignment one just needs to align the + * size. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and therefore may influence + * locking overhead. + * + * The offset is used to relocate the free list link in each object. It is + * therefore possible to move the free list link behind the object. This + * is necessary for RCU to work properly and also useful for debugging. + */ + +/* + * Mininum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slub_min_order; +static int slub_max_order = DEFAULT_MAX_ORDER; + +/* + * Minimum number of objects per slab. This is necessary in order to + * reduce locking overhead. Similar to the queue size in SLAB. + */ +static int slub_min_objects = DEFAULT_MIN_OBJECTS; + +/* + * Merge control. If this is set then no merging of slab caches will occur. + */ +static int slub_nomerge; + +/* + * Debug settings: + */ +static int slub_debug; + +static char *slub_debug_slabs; + +/* + * Calculate the order of allocation given an slab object size. + * + * The order of allocation has significant impact on other elements + * of the system. Generally order 0 allocations should be preferred + * since they do not cause fragmentation in the page allocator. Larger + * objects may have problems with order 0 because there may be too much + * space left unused in a slab. We go to a higher order if more than 1/8th + * of the slab would be wasted. + * + * In order to reach satisfactory performance we must ensure that + * a minimum number of objects is in one slab. Otherwise we may + * generate too much activity on the partial lists. This is less a + * concern for large slabs though. slub_max_order specifies the order + * where we begin to stop considering the number of objects in a slab. + * + * Higher order allocations also allow the placement of more objects + * in a slab and thereby reduce object handling overhead. 
If the user + * has requested a higher mininum order then we start with that one + * instead of zero. + */ +static int calculate_order(int size) +{ + int order; + int rem; + + for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); + order < MAX_ORDER; order++) { + unsigned long slab_size = PAGE_SIZE << order; + + if (slub_max_order > order && + slab_size < slub_min_objects * size) + continue; + + if (slab_size < size) + continue; + + rem = slab_size % size; + + if (rem <= (PAGE_SIZE << order) / 8) + break; + + } + if (order >= MAX_ORDER) + return -E2BIG; + return order; +} + +/* + * Function to figure out which alignment to use from the + * various ways of specifying it. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then + * follow that suggestion if the object is sufficiently + * large. + * + * The hardware cache alignment cannot override the + * specified alignment though. If that is greater + * then use it. + */ + if ((flags & SLAB_HWCACHE_ALIGN) && + size > L1_CACHE_BYTES / 2) + return max_t(unsigned long, align, L1_CACHE_BYTES); + + if (align < ARCH_SLAB_MINALIGN) + return ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static void init_kmem_cache_node(struct kmem_cache_node *n) +{ + n->nr_partial = 0; + atomic_long_set(&n->nr_slabs, 0); + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + INIT_LIST_HEAD(&n->full); +} + +#ifdef CONFIG_NUMA +/* + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmalloc_node_cache + * when allocating for the kmalloc_node_cache. 
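+ *
+ * kmem_cache_alloc_node() is not usable yet, so the first object of
+ * the newly allocated slab is carved out by hand and used as the
+ * kmem_cache_node structure for this node.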
+ */ +static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) +{ + struct page *page; + struct kmem_cache_node *n; + + BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + + page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + /* new_slab() disables interupts */ + local_irq_enable(); + + BUG_ON(!page); + n = page->freelist; + BUG_ON(!n); + page->freelist = get_freepointer(kmalloc_caches, n); + page->inuse++; + kmalloc_caches->node[node] = n; + init_object(kmalloc_caches, n, 1); + init_kmem_cache_node(n); + atomic_long_inc(&n->nr_slabs); + add_partial(n, page); + return n; +} + +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + + for_each_online_node(node) { + struct kmem_cache_node *n = s->node[node]; + if (n && n != &s->local_node) + kmem_cache_free(kmalloc_caches, n); + s->node[node] = NULL; + } +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + int node; + int local_node; + + if (slab_state >= UP) + local_node = page_to_nid(virt_to_page(s)); + else + local_node = 0; + + for_each_online_node(node) { + struct kmem_cache_node *n; + + if (local_node == node) + n = &s->local_node; + else { + if (slab_state == DOWN) { + n = early_kmem_cache_node_alloc(gfpflags, + node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } + + } + s->node[node] = n; + init_kmem_cache_node(n); + } + return 1; +} +#else +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ +} + +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + init_kmem_cache_node(&s->local_node); + return 1; +} +#endif + +/* + * calculate_sizes() determines the order and the distribution of data within + * a slab object. + */ +static int calculate_sizes(struct kmem_cache *s) +{ + unsigned long flags = s->flags; + unsigned long size = s->objsize; + unsigned long align = s->align; + + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + !s->ctor && !s->dtor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; + + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + + /* + * If we are redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word, so that we can establish a redzone between + * the object and the freepointer to be able to check for overwrites. + */ + if ((flags & SLAB_RED_ZONE) && size == s->objsize) + size += sizeof(void *); + + /* + * With that we have determined how much of the slab is in actual + * use by the object. This is the potential offset to the free + * pointer. + */ + s->inuse = size; + + if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + s->ctor || s->dtor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor or are poisoning the objects. 
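+		 *
+		 * For example (an illustration only): a cache created with
+		 * just SLAB_DESTROY_BY_RCU ends up with s->offset equal to
+		 * the word aligned object size and s->size grown by
+		 * sizeof(void *), so the free pointer lives behind the
+		 * untouched object.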
+ */ + s->offset = size; + size += sizeof(void *); + } + + if (flags & SLAB_STORE_USER) + /* + * Need to store information about allocs and frees after + * the object. + */ + size += 2 * sizeof(struct track); + + if (flags & DEBUG_DEFAULT_FLAGS) + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if an user writes before the start + * of the object. + */ + size += sizeof(void *); + /* + * Determine the alignment based on various parameters that the + * user specified (this is unecessarily complex due to the attempt + * to be compatible with SLAB. Should be cleaned up some day). + */ + align = calculate_alignment(flags, align, s->objsize); + + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment. + */ + size = ALIGN(size, align); + s->size = size; + + s->order = calculate_order(size); + if (s->order < 0) + return 0; + + /* + * Determine the number of objects per slab + */ + s->objects = (PAGE_SIZE << s->order) / size; + + /* + * Verify that the number of objects is within permitted limits. + * The page->inuse field is only 16 bit wide! So we cannot have + * more than 64k objects per slab. + */ + if (!s->objects || s->objects > 65535) + return 0; + return 1; + +} + +static int __init finish_bootstrap(void) +{ + struct list_head *h; + int err; + + slab_state = SYSFS; + + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + err = sysfs_slab_add(s); + BUG_ON(err); + } + return 0; +} + +static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, + const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + memset(s, 0, kmem_size); + s->name = name; + s->ctor = ctor; + s->dtor = dtor; + s->objsize = size; + s->flags = flags; + s->align = align; + + /* + * The page->offset field is only 16 bit wide. This is an offset + * in units of words from the beginning of an object. If the slab + * size is bigger then we cannot move the free pointer behind the + * object anymore. + * + * On 32 bit platforms the limit is 256k. On 64bit platforms + * the limit is 512k. + * + * Debugging or ctor/dtors may create a need to move the free + * pointer. Fail if this happens. + */ + if (s->size >= 65535 * sizeof(void *)) { + BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); + BUG_ON(ctor || dtor); + } + else + /* + * Enable debugging if selected on the kernel commandline. 
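+		 * See setup_slub_debug() below: slub_debug holds the flags
+		 * selected by the F/Z/P/U/T characters and slub_debug_slabs,
+		 * if given, limits them to caches whose name starts with that
+		 * string, e.g. slub_debug=ZP,kmalloc- (example values only).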
+ */ + if (slub_debug && (!slub_debug_slabs || + strncmp(slub_debug_slabs, name, + strlen(slub_debug_slabs)) == 0)) + s->flags |= slub_debug; + + if (!calculate_sizes(s)) + goto error; + + s->refcount = 1; +#ifdef CONFIG_NUMA + s->defrag_ratio = 100; +#endif + + if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + return 1; +error: + if (flags & SLAB_PANIC) + panic("Cannot create slab %s size=%lu realsize=%u " + "order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)size, s->size, s->order, + s->offset, flags); + return 0; +} +EXPORT_SYMBOL(kmem_cache_open); + +/* + * Check if a given pointer is valid + */ +int kmem_ptr_validate(struct kmem_cache *s, const void *object) +{ + struct page * page; + void *addr; + + page = get_object_page(object); + + if (!page || s != page->slab) + /* No slab or wrong slab */ + return 0; + + addr = page_address(page); + if (object < addr || object >= addr + s->objects * s->size) + /* Out of bounds */ + return 0; + + if ((object - addr) % s->size) + /* Improperly aligned */ + return 0; + + /* + * We could also check if the object is on the slabs freelist. + * But this would be too expensive and it seems that the main + * purpose of kmem_ptr_valid is to check if the object belongs + * to a certain slab. + */ + return 1; +} +EXPORT_SYMBOL(kmem_ptr_validate); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->objsize; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *s) +{ + return s->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +/* + * Attempt to free all slabs on a node + */ +static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, + struct list_head *list) +{ + int slabs_inuse = 0; + unsigned long flags; + struct page *page, *h; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, h, list, lru) + if (!page->inuse) { + list_del(&page->lru); + discard_slab(s, page); + } else + slabs_inuse++; + spin_unlock_irqrestore(&n->list_lock, flags); + return slabs_inuse; +} + +/* + * Release all resources used by slab cache + */ +static int kmem_cache_close(struct kmem_cache *s) +{ + int node; + + flush_all(s); + + /* Attempt to free all objects */ + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + n->nr_partial -= free_list(s, n, &n->partial); + if (atomic_long_read(&n->nr_slabs)) + return 1; + } + free_kmem_cache_nodes(s); + return 0; +} + +/* + * Close a cache and release the kmem_cache structure + * (must be used for caches created using kmem_cache_create) + */ +void kmem_cache_destroy(struct kmem_cache *s) +{ + down_write(&slub_lock); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + if (kmem_cache_close(s)) + WARN_ON(1); + sysfs_slab_remove(s); + kfree(s); + } + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ + +struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +EXPORT_SYMBOL(kmalloc_caches); + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; +#endif + +static int __init setup_slub_min_order(char *str) +{ + get_option (&str, &slub_min_order); + + return 1; +} + +__setup("slub_min_order=", setup_slub_min_order); + +static int __init setup_slub_max_order(char *str) +{ + get_option (&str, &slub_max_order); + + return 1; +} + 
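+/*
+ * Illustrative values: booting with "slub_min_order=3" makes
+ * calculate_order() start at order 3, while slub_max_order is the
+ * order at and above which the slub_min_objects requirement is no
+ * longer enforced.
+ */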
+__setup("slub_max_order=", setup_slub_max_order); + +static int __init setup_slub_min_objects(char *str) +{ + get_option (&str, &slub_min_objects); + + return 1; +} + +__setup("slub_min_objects=", setup_slub_min_objects); + +static int __init setup_slub_nomerge(char *str) +{ + slub_nomerge = 1; + return 1; +} + +__setup("slub_nomerge", setup_slub_nomerge); + +static int __init setup_slub_debug(char *str) +{ + if (!str || *str != '=') + slub_debug = DEBUG_DEFAULT_FLAGS; + else { + str++; + if (*str == 0 || *str == ',') + slub_debug = DEBUG_DEFAULT_FLAGS; + else + for( ;*str && *str != ','; str++) + switch (*str) { + case 'f' : case 'F' : + slub_debug |= SLAB_DEBUG_FREE; + break; + case 'z' : case 'Z' : + slub_debug |= SLAB_RED_ZONE; + break; + case 'p' : case 'P' : + slub_debug |= SLAB_POISON; + break; + case 'u' : case 'U' : + slub_debug |= SLAB_STORE_USER; + break; + case 't' : case 'T' : + slub_debug |= SLAB_TRACE; + break; + default: + printk(KERN_ERR "slub_debug option '%c' " + "unknown. skipped\n",*str); + } + } + + if (*str == ',') + slub_debug_slabs = str + 1; + return 1; +} + +__setup("slub_debug", setup_slub_debug); + +static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, + const char *name, int size, gfp_t gfp_flags) +{ + unsigned int flags = 0; + + if (gfp_flags & SLUB_DMA) + flags = SLAB_CACHE_DMA; + + down_write(&slub_lock); + if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + flags, NULL, NULL)) + goto panic; + + list_add(&s->list, &slab_caches); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto panic; + return s; + +panic: + panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); +} + +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index = kmalloc_index(size); + + if (!index) + return NULL; + + /* Allocation too large? */ + BUG_ON(index < 0); + +#ifdef CONFIG_ZONE_DMA + if ((flags & SLUB_DMA)) { + struct kmem_cache *s; + struct kmem_cache *x; + char *text; + size_t realsize; + + s = kmalloc_caches_dma[index]; + if (s) + return s; + + /* Dynamically create dma cache */ + x = kmalloc(kmem_size, flags & ~SLUB_DMA); + if (!x) + panic("Unable to allocate memory for dma cache\n"); + + if (index <= KMALLOC_SHIFT_HIGH) + realsize = 1 << index; + else { + if (index == 1) + realsize = 96; + else + realsize = 192; + } + + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = create_kmalloc_cache(x, text, realsize, flags); + kmalloc_caches_dma[index] = s; + return s; + } +#endif + return &kmalloc_caches[index]; +} + +void *__kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *s = get_slab(size, flags); + + if (s) + return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return NULL; +} +EXPORT_SYMBOL(__kmalloc); + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + struct kmem_cache *s = get_slab(size, flags); + + if (s) + return slab_alloc(s, flags, node, __builtin_return_address(0)); + return NULL; +} +EXPORT_SYMBOL(__kmalloc_node); +#endif + +size_t ksize(const void *object) +{ + struct page *page = get_object_page(object); + struct kmem_cache *s; + + BUG_ON(!page); + s = page->slab; + BUG_ON(!s); + + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} +EXPORT_SYMBOL(ksize); + +void kfree(const void *x) +{ + struct kmem_cache *s; + struct page *page; + + if (!x) + return; + + page = virt_to_head_page(x); + s = page->slab; + + slab_free(s, page, (void *)x, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kfree); + +/* + * kmem_cache_shrink removes empty slabs from the partial lists + * and then sorts the partially allocated slabs by the number + * of items in use. The slabs with the most items in use + * come first. New allocations will remove these from the + * partial list because they are full. The slabs with the + * least items are placed last. If it happens that the objects + * are freed then the page can be returned to the page allocator. + */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_online_node(node) { + n = get_node(s, node); + + if (!n->nr_partial) + continue; + + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in + * each slab or free slabs if empty. + * + * Note that concurrent frees may occur while + * we hold the list_lock. page->inuse here is + * the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse && slab_trylock(page)) { + /* + * Must hold slab lock here because slab_free + * may have freed the last object and be + * waiting to release the slab. + */ + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + } else { + if (n->nr_partial > MAX_PARTIAL) + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + } + + if (n->nr_partial <= MAX_PARTIAL) + goto out; + + /* + * Rebuild the partial list with the slabs filled up + * most first and the least used slabs at the end. + */ + for (i = s->objects - 1; i >= 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); + + out: + spin_unlock_irqrestore(&n->list_lock, flags); + } + + kfree(slabs_by_inuse); + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + struct kmem_cache *new_cache; + void *ret; + struct page *page; + + if (unlikely(!p)) + return kmalloc(new_size, flags); + + if (unlikely(!new_size)) { + kfree(p); + return NULL; + } + + page = virt_to_head_page(p); + + new_cache = get_slab(new_size, flags); + + /* + * If new size fits in the current cache, bail out. 
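+	 * For instance, growing a kmalloc(100) buffer to 120 bytes stays
+	 * in the same kmalloc-128 cache, so the old pointer is returned
+	 * as is (sizes are an illustration only).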
+ */ + if (likely(page->slab == new_cache)) + return (void *)p; + + ret = kmalloc(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ksize(p))); + kfree(p); + } + return ret; +} +EXPORT_SYMBOL(krealloc); + +/******************************************************************** + * Basic setup of slabs + *******************************************************************/ + +void __init kmem_cache_init(void) +{ + int i; + +#ifdef CONFIG_NUMA + /* + * Must first have the slab cache available for the allocations of the + * struct kmalloc_cache_node's. There is special bootstrap code in + * kmem_cache_open for slab_state == DOWN. + */ + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_KERNEL); +#endif + + /* Able to allocate the per node structures */ + slab_state = PARTIAL; + + /* Caches that are not of the two-to-the-power-of size */ + create_kmalloc_cache(&kmalloc_caches[1], + "kmalloc-96", 96, GFP_KERNEL); + create_kmalloc_cache(&kmalloc_caches[2], + "kmalloc-192", 192, GFP_KERNEL); + + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); + + slab_state = UP; + + /* Provide the correct kmalloc names now that the caches are up */ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + kmalloc_caches[i]. name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + +#ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); +#endif + + if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct page *); + + printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " Processors=%d, Nodes=%d\n", + KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); +} + +/* + * Find a mergeable slab cache + */ +static int slab_unmergeable(struct kmem_cache *s) +{ + if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) + return 1; + + if (s->ctor || s->dtor) + return 1; + + return 0; +} + +static struct kmem_cache *find_mergeable(size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + struct list_head *h; + + if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) + return NULL; + + if (ctor || dtor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if (((flags | slub_debug) & SLUB_MERGE_SAME) != + (s->flags & SLUB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. + * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align -1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + return s; + } + return NULL; +} + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *s; + + down_write(&slub_lock); + s = find_mergeable(size, align, flags, dtor, ctor); + if (s) { + s->refcount++; + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. 
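+		 * Otherwise kmem_cache_zalloc() on the alias would clear
+		 * only the smaller of the two object sizes.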
+ */ + s->objsize = max(s->objsize, (int)size); + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + if (sysfs_slab_alias(s, name)) + goto err; + } else { + s = kmalloc(kmem_size, GFP_KERNEL); + if (s && kmem_cache_open(s, GFP_KERNEL, name, + size, align, flags, ctor, dtor)) { + if (sysfs_slab_add(s)) { + kfree(s); + goto err; + } + list_add(&s->list, &slab_caches); + } else + kfree(s); + } + up_write(&slub_lock); + return s; + +err: + up_write(&slub_lock); + if (flags & SLAB_PANIC) + panic("Cannot create slabcache %s\n", name); + else + s = NULL; + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) +{ + void *x; + + x = slab_alloc(s, flags, -1, __builtin_return_address(0)); + if (x) + memset(x, 0, s->objsize); + return x; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +#ifdef CONFIG_SMP +static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) +{ + struct list_head *h; + + down_read(&slub_lock); + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + func(s, cpu); + } + up_read(&slub_lock); +} + +/* + * Use the cpu notifier to insure that the slab are flushed + * when necessary. + */ +static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_DEAD: + for_all_slabs(__flush_cpu_slab, cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata slab_notifier = + { &slab_cpuup_callback, NULL, 0 }; + +#endif + +#ifdef CONFIG_NUMA + +/***************************************************************** + * Generic reaper used to support the page allocator + * (the cpu slabs are reaped by a per slab workqueue). + * + * Maybe move this to the page allocator? + ****************************************************************/ + +static DEFINE_PER_CPU(unsigned long, reap_node); + +static void init_reap_node(int cpu) +{ + int node; + + node = next_node(cpu_to_node(cpu), node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); + + __get_cpu_var(reap_node) = node; +} + +static void next_reap_node(void) +{ + int node = __get_cpu_var(reap_node); + + /* + * Also drain per cpu pages on remote zones + */ + if (node != numa_node_id()) + drain_node_pages(node); + + node = next_node(node, node_online_map); + if (unlikely(node >= MAX_NUMNODES)) + node = first_node(node_online_map); + __get_cpu_var(reap_node) = node; +} +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + +#define REAPTIMEOUT_CPUC (2*HZ) + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, reap_work); + +static void cache_reap(struct work_struct *unused) +{ + next_reap_node(); + refresh_cpu_vm_stats(smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), + REAPTIMEOUT_CPUC); +} + +static void __devinit start_cpu_timer(int cpu) +{ + struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. 
+ */ + if (keventd_up() && reap_work->work.func == NULL) { + init_reap_node(cpu); + INIT_DELAYED_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + } +} + +static int __init cpucache_init(void) +{ + int cpu; + + /* + * Register the timers that drain pcp pages and update vm statistics + */ + for_each_online_cpu(cpu) + start_cpu_timer(cpu); + return 0; +} +__initcall(cpucache_init); +#endif + +#ifdef SLUB_RESILIENCY_TEST +static unsigned long validate_slab_cache(struct kmem_cache *s); + +static void resiliency_test(void) +{ + u8 *p; + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches + 4); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches + 5); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches + 6); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 7); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 8); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches + 9); +} +#else +static void resiliency_test(void) {}; +#endif + +/* + * These are not as efficient as kmalloc for the non debug case. + * We do not have the page struct available so we have to touch one + * cacheline in struct kmem_cache to check slab flags. 
+ */ +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +{ + struct kmem_cache *s = get_slab(size, gfpflags); + + if (!s) + return NULL; + + return slab_alloc(s, gfpflags, -1, caller); +} + +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, void *caller) +{ + struct kmem_cache *s = get_slab(size, gfpflags); + + if (!s) + return NULL; + + return slab_alloc(s, gfpflags, node, caller); +} + +#ifdef CONFIG_SYSFS + +static int validate_slab(struct kmem_cache *s, struct page *page) +{ + void *p; + void *addr = page_address(page); + unsigned long map[BITS_TO_LONGS(s->objects)]; + + if (!check_slab(s, page) || + !on_freelist(s, page, NULL)) + return 0; + + /* Now we know that a valid freelist exists */ + bitmap_zero(map, s->objects); + + for(p = page->freelist; p; p = get_freepointer(s, p)) { + set_bit((p - addr) / s->size, map); + if (!check_object(s, page, p, 0)) + return 0; + } + + for(p = addr; p < addr + s->objects * s->size; p += s->size) + if (!test_bit((p - addr) / s->size, map)) + if (!check_object(s, page, p, 1)) + return 0; + return 1; +} + +static void validate_slab_slab(struct kmem_cache *s, struct page *page) +{ + if (slab_trylock(page)) { + validate_slab(s, page); + slab_unlock(page); + } else + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", + s->name, page); + + if (s->flags & DEBUG_DEFAULT_FLAGS) { + if (!PageError(page)) + printk(KERN_ERR "SLUB %s: PageError not set " + "on slab 0x%p\n", s->name, page); + } else { + if (PageError(page)) + printk(KERN_ERR "SLUB %s: PageError set on " + "slab 0x%p\n", s->name, page); + } +} + +static int validate_slab_node(struct kmem_cache *s, struct kmem_cache_node *n) +{ + unsigned long count = 0; + struct page *page; + unsigned long flags; + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, lru) { + validate_slab_slab(s, page); + count++; + } + if (count != n->nr_partial) + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(page, &n->full, lru) { + validate_slab_slab(s, page); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); + +out: + spin_unlock_irqrestore(&n->list_lock, flags); + return count; +} + +static unsigned long validate_slab_cache(struct kmem_cache *s) +{ + int node; + unsigned long count = 0; + + flush_all(s); + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + count += validate_slab_node(s, n); + } + return count; +} + +/* + * Generate lists of locations where slabcache objects are allocated + * and freed. 
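+ *
+ * The resulting lists back the alloc_calls and free_calls sysfs
+ * attributes defined further down.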
+ */ + +struct location { + unsigned long count; + void *addr; +}; + +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; +}; + +static void free_loc_track(struct loc_track *t) +{ + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} + +static int alloc_loc_track(struct loc_track *t, unsigned long max) +{ + struct location *l; + int order; + + if (!max) + max = PAGE_SIZE / sizeof(struct location); + + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(GFP_KERNEL, order); + + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; +} + +static int add_location(struct loc_track *t, struct kmem_cache *s, + void *addr) +{ + long start, end, pos; + struct location *l; + void *caddr; + + start = -1; + end = t->count; + + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something to before end. + */ + if (pos == end) + break; + + caddr = t->loc[pos].addr; + if (addr == caddr) { + t->loc[pos].count++; + return 1; + } + + if (addr < caddr) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element + */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = addr; + return 1; +} + +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct page *page, enum track_item alloc) +{ + void *addr = page_address(page); + unsigned long map[BITS_TO_LONGS(s->objects)]; + void *p; + + bitmap_zero(map, s->objects); + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit((p - addr) / s->size, map); + + for (p = addr; p < addr + s->objects * s->size; p += s->size) + if (!test_bit((p - addr) / s->size, map)) { + void *addr = get_track(s, p, alloc)->addr; + + add_location(t, s, addr); + } +} + +static int list_locations(struct kmem_cache *s, char *buf, + enum track_item alloc) +{ + int n = 0; + unsigned long i; + struct loc_track t; + int node; + + t.count = 0; + t.max = 0; + + /* Push back cpu slabs */ + flush_all(s); + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + unsigned long flags; + struct page *page; + + if (!atomic_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, lru) + process_slab(&t, s, page, alloc); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { + void *addr = t.loc[i].addr; + + if (n > PAGE_SIZE - 100) + break; + n += sprintf(buf + n, "%7ld ", t.loc[i].count); + if (addr) + n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); + else + n += sprintf(buf + n, "<not-available>"); + n += sprintf(buf + n, "\n"); + } + + free_loc_track(&t); + if (!t.count) + n += sprintf(buf, "No data\n"); + return n; +} + +static unsigned long count_partial(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct page *page; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + x += page->inuse; + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + +enum slab_stat_type { + SL_FULL, + SL_PARTIAL, + 
SL_CPU, + SL_OBJECTS +}; + +#define SO_FULL (1 << SL_FULL) +#define SO_PARTIAL (1 << SL_PARTIAL) +#define SO_CPU (1 << SL_CPU) +#define SO_OBJECTS (1 << SL_OBJECTS) + +static unsigned long slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) +{ + unsigned long total = 0; + int cpu; + int node; + int x; + unsigned long *nodes; + unsigned long *per_cpu; + + nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + per_cpu = nodes + nr_node_ids; + + for_each_possible_cpu(cpu) { + struct page *page = s->cpu_slab[cpu]; + int node; + + if (page) { + node = page_to_nid(page); + if (flags & SO_CPU) { + int x = 0; + + if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + total += x; + nodes[node] += x; + } + per_cpu[node]++; + } + } + + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (flags & SO_PARTIAL) { + if (flags & SO_OBJECTS) + x = count_partial(n); + else + x = n->nr_partial; + total += x; + nodes[node] += x; + } + + if (flags & SO_FULL) { + int full_slabs = atomic_read(&n->nr_slabs) + - per_cpu[node] + - n->nr_partial; + + if (flags & SO_OBJECTS) + x = full_slabs * s->objects; + else + x = full_slabs; + total += x; + nodes[node] += x; + } + } + + x = sprintf(buf, "%lu", total); +#ifdef CONFIG_NUMA + for_each_online_node(node) + if (nodes[node]) + x += sprintf(buf + x, " N%d=%lu", + node, nodes[node]); +#endif + kfree(nodes); + return x + sprintf(buf + x, "\n"); +} + +static int any_slab_objects(struct kmem_cache *s) +{ + int node; + int cpu; + + for_each_possible_cpu(cpu) + if (s->cpu_slab[cpu]) + return 1; + + for_each_node(node) { + struct kmem_cache_node *n = get_node(s, node); + + if (n->nr_partial || atomic_read(&n->nr_slabs)) + return 1; + } + return 0; +} + +#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab(n) container_of(n, struct kmem_cache, kobj); + +struct slab_attribute { + struct attribute attr; + ssize_t (*show)(struct kmem_cache *s, char *buf); + ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); +}; + +#define SLAB_ATTR_RO(_name) \ + static struct slab_attribute _name##_attr = __ATTR_RO(_name) + +#define SLAB_ATTR(_name) \ + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t slab_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->size); +} +SLAB_ATTR_RO(slab_size); + +static ssize_t align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->align); +} +SLAB_ATTR_RO(align); + +static ssize_t object_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->objsize); +} +SLAB_ATTR_RO(object_size); + +static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->objects); +} +SLAB_ATTR_RO(objs_per_slab); + +static ssize_t order_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->order); +} +SLAB_ATTR_RO(order); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (s->ctor) { + int n = sprint_symbol(buf, (unsigned long)s->ctor); + + return n + sprintf(buf + n, "\n"); + } + return 0; +} +SLAB_ATTR_RO(ctor); + +static ssize_t dtor_show(struct kmem_cache *s, char *buf) +{ + if (s->dtor) { + int n = sprint_symbol(buf, (unsigned long)s->dtor); + + return n + sprintf(buf + n, "\n"); + } + return 0; +} +SLAB_ATTR_RO(dtor); + +static ssize_t aliases_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->refcount - 1); +} 
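+/* refcount starts at 1; refcount - 1 is the number of merged caches */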
+SLAB_ATTR_RO(aliases); + +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU); +} +SLAB_ATTR_RO(slabs); + +static ssize_t partial_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_PARTIAL); +} +SLAB_ATTR_RO(partial); + +static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_CPU); +} +SLAB_ATTR_RO(cpu_slabs); + +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); +} + +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') + s->flags |= SLAB_DEBUG_FREE; + return length; +} +SLAB_ATTR(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} + +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') + s->flags |= SLAB_TRACE; + return length; +} +SLAB_ATTR(trace); + +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} + +static ssize_t reclaim_account_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_RECLAIM_ACCOUNT; + if (buf[0] == '1') + s->flags |= SLAB_RECLAIM_ACCOUNT; + return length; +} +SLAB_ATTR(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +static ssize_t red_zone_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); +} + +static ssize_t red_zone_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_RED_ZONE; + if (buf[0] == '1') + s->flags |= SLAB_RED_ZONE; + calculate_sizes(s); + return length; +} +SLAB_ATTR(red_zone); + +static ssize_t poison_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); +} + +static ssize_t poison_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_POISON; + if (buf[0] == '1') + s->flags |= SLAB_POISON; + calculate_sizes(s); + return length; +} +SLAB_ATTR(poison); + +static ssize_t store_user_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); +} + +static ssize_t store_user_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_STORE_USER; + if (buf[0] == '1') + s->flags |= SLAB_STORE_USER; + calculate_sizes(s); + return length; +} +SLAB_ATTR(store_user); + +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; 
+} + +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') + validate_slab_cache(s); + else + return -EINVAL; + return length; +} +SLAB_ATTR(validate); + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); + +#ifdef CONFIG_NUMA +static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->defrag_ratio / 10); +} + +static ssize_t defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int n = simple_strtoul(buf, NULL, 10); + + if (n < 100) + s->defrag_ratio = n * 10; + return length; +} +SLAB_ATTR(defrag_ratio); +#endif + +static struct attribute * slab_attrs[] = { + &slab_size_attr.attr, + &object_size_attr.attr, + &objs_per_slab_attr.attr, + &order_attr.attr, + &objects_attr.attr, + &slabs_attr.attr, + &partial_attr.attr, + &cpu_slabs_attr.attr, + &ctor_attr.attr, + &dtor_attr.attr, + &aliases_attr.attr, + &align_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, + &hwcache_align_attr.attr, + &reclaim_account_attr.attr, + &destroy_by_rcu_attr.attr, + &red_zone_attr.attr, + &poison_attr.attr, + &store_user_attr.attr, + &validate_attr.attr, + &shrink_attr.attr, + &alloc_calls_attr.attr, + &free_calls_attr.attr, +#ifdef CONFIG_ZONE_DMA + &cache_dma_attr.attr, +#endif +#ifdef CONFIG_NUMA + &defrag_ratio_attr.attr, +#endif + NULL +}; + +static struct attribute_group slab_attr_group = { + .attrs = slab_attrs, +}; + +static ssize_t slab_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->show) + return -EIO; + + err = attribute->show(s, buf); + + return err; +} + +static ssize_t slab_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(s, buf, len); + + return err; +} + +static struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, +}; + +static struct kobj_type slab_ktype = { + .sysfs_ops = &slab_sysfs_ops, +}; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &slab_ktype) + return 1; + return 0; +} + +static struct kset_uevent_ops slab_uevent_ops = { + .filter = uevent_filter, +}; + +decl_subsys(slab, &slab_ktype, &slab_uevent_ops); + +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * format + * :[flags-]size:[memory address of kmemcache] + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; 
+
+	BUG_ON(!name);
+
+	*p++ = ':';
+	/*
+	 * First flags affecting slabcache operations. We will only
+	 * get here for aliasable slabs so we do not need to support
+	 * too many flags. The flags here must cover all flags that
+	 * are matched during merging to guarantee that the id is
+	 * unique.
+	 */
+	if (s->flags & SLAB_CACHE_DMA)
+		*p++ = 'd';
+	if (s->flags & SLAB_RECLAIM_ACCOUNT)
+		*p++ = 'a';
+	if (s->flags & SLAB_DEBUG_FREE)
+		*p++ = 'F';
+	if (p != name + 1)
+		*p++ = '-';
+	p += sprintf(p, "%07d", s->size);
+	BUG_ON(p > name + ID_STR_LENGTH - 1);
+	return name;
+}
+
+static int sysfs_slab_add(struct kmem_cache *s)
+{
+	int err;
+	const char *name;
+	int unmergeable;
+
+	if (slab_state < SYSFS)
+		/* Defer until later */
+		return 0;
+
+	unmergeable = slab_unmergeable(s);
+	if (unmergeable) {
+		/*
+		 * Slabcache can never be merged so we can use the name proper.
+		 * This is typically the case for debug situations. In that
+		 * case we can catch duplicate names easily.
+		 */
+		sysfs_remove_link(&slab_subsys.kset.kobj, s->name);
+		name = s->name;
+	} else {
+		/*
+		 * Create a unique name for the slab as a target
+		 * for the symlinks.
+		 */
+		name = create_unique_id(s);
+	}
+
+	kobj_set_kset_s(s, slab_subsys);
+	kobject_set_name(&s->kobj, name);
+	kobject_init(&s->kobj);
+	err = kobject_add(&s->kobj);
+	if (err)
+		return err;
+
+	err = sysfs_create_group(&s->kobj, &slab_attr_group);
+	if (err)
+		return err;
+	kobject_uevent(&s->kobj, KOBJ_ADD);
+	if (!unmergeable) {
+		/* Setup first alias */
+		sysfs_slab_alias(s, s->name);
+		kfree(name);
+	}
+	return 0;
+}
+
+static void sysfs_slab_remove(struct kmem_cache *s)
+{
+	kobject_uevent(&s->kobj, KOBJ_REMOVE);
+	kobject_del(&s->kobj);
+}
+
+/*
+ * Need to buffer aliases during bootup until sysfs becomes
+ * available lest we lose that information.
+ */
+struct saved_alias {
+	struct kmem_cache *s;
+	const char *name;
+	struct saved_alias *next;
+};
+
+struct saved_alias *alias_list;
+
+static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{
+	struct saved_alias *al;
+
+	if (slab_state == SYSFS) {
+		/*
+		 * If we have a leftover link then remove it.
+ */ + sysfs_remove_link(&slab_subsys.kset.kobj, name); + return sysfs_create_link(&slab_subsys.kset.kobj, + &s->kobj, name); + } + + al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); + if (!al) + return -ENOMEM; + + al->s = s; + al->name = name; + al->next = alias_list; + alias_list = al; + return 0; +} + +static int __init slab_sysfs_init(void) +{ + int err; + + err = subsystem_register(&slab_subsys); + if (err) { + printk(KERN_ERR "Cannot register slab subsystem.\n"); + return -ENOSYS; + } + + finish_bootstrap(); + + while (alias_list) { + struct saved_alias *al = alias_list; + + alias_list = alias_list->next; + err = sysfs_slab_alias(al->s, al->name); + BUG_ON(err); + kfree(al); + } + + resiliency_test(); + return 0; +} + +__initcall(slab_sysfs_init); +#else +__initcall(finish_bootstrap); +#endif diff --git a/mm/swap.c b/mm/swap.c index 2ed7be39795..218c52a24a2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -55,7 +55,7 @@ static void fastcall __page_cache_release(struct page *page) static void put_compound_page(struct page *page) { - page = (struct page *)page_private(page); + page = compound_head(page); if (put_page_testzero(page)) { compound_page_dtor *dtor; diff --git a/mm/swapfile.c b/mm/swapfile.c index a2d9bb4e80d..acc172cbe3a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1531,9 +1531,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) error = PTR_ERR(page); goto bad_swap; } - wait_on_page_locked(page); - if (!PageUptodate(page)) - goto bad_swap; kmap(page); swap_header = page_address(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index db023e2ff38..56651a10c36 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1323,8 +1323,6 @@ static int kswapd(void *p) for ( ; ; ) { unsigned long new_order; - try_to_freeze(); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; @@ -1335,12 +1333,19 @@ static int kswapd(void *p) */ order = new_order; } else { - schedule(); + if (!freezing(current)) + schedule(); + order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - balance_pgdat(pgdat, order); + if (!try_to_freeze()) { + /* We can speed up thawing tasks if we don't call + * balance_pgdat after returning from the refrigerator + */ + balance_pgdat(pgdat, order); + } } return 0; } |
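
Illustrative note (not part of the patch above): the alloc_loc_track()/add_location() pair keeps the per-call-site statistics in an array sorted by caller address, binary-searching for the slot, bumping a counter on a hit, and otherwise memmove()-ing the tail up and doubling the array when it fills. The userspace sketch below reproduces only that bookkeeping pattern for illustration; malloc() stands in for __get_free_pages(), and the names here (struct loc, loc_track_add, loc_track_grow) are invented, not taken from the kernel.

/* Not part of the patch: userspace sketch of the sorted-array tracking
 * pattern used by add_location(); malloc() replaces __get_free_pages(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct loc {
	unsigned long count;
	void *addr;
};

struct loc_track {
	unsigned long max;
	unsigned long count;
	struct loc *loc;
};

/* Grow the backing array to "max" entries, keeping existing data. */
static int loc_track_grow(struct loc_track *t, unsigned long max)
{
	struct loc *l = malloc(max * sizeof(struct loc));

	if (!l)
		return 0;
	if (t->count)
		memcpy(l, t->loc, t->count * sizeof(struct loc));
	free(t->loc);
	t->loc = l;
	t->max = max;
	return 1;
}

/* Binary-search the sorted array; bump the counter on a hit,
 * otherwise shift the tail up and insert a new entry. */
static int loc_track_add(struct loc_track *t, void *addr)
{
	long start = -1, end = t->count, pos;
	void *caddr;

	for (;;) {
		pos = start + (end - start + 1) / 2;
		if (pos == end)		/* nothing at "end": insert here */
			break;

		caddr = t->loc[pos].addr;
		if (addr == caddr) {
			t->loc[pos].count++;
			return 1;
		}
		if ((char *)addr < (char *)caddr)
			end = pos;
		else
			start = pos;
	}

	if (t->count >= t->max &&
	    !loc_track_grow(t, t->max ? 2 * t->max : 16))
		return 0;

	memmove(t->loc + pos + 1, t->loc + pos,
		(t->count - pos) * sizeof(struct loc));
	t->count++;
	t->loc[pos].addr = addr;
	t->loc[pos].count = 1;
	return 1;
}

int main(void)
{
	struct loc_track t = { 0, 0, NULL };
	int dummy[4];

	loc_track_add(&t, &dummy[2]);
	loc_track_add(&t, &dummy[0]);
	loc_track_add(&t, &dummy[2]);	/* repeat: its count becomes 2 */

	for (unsigned long i = 0; i < t.count; i++)
		printf("%7lu %p\n", t.loc[i].count, t.loc[i].addr);
	free(t.loc);
	return 0;
}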
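
Illustrative note (not part of the patch above): process_slab() identifies live objects indirectly. It walks the page's freelist, marks each free slot in a temporary bitmap, and then treats every unmarked slot as allocated so it can read the track record stored there. The toy program below shows the same "mark the free list, enumerate the rest" idiom against a malloc()-backed fake slab; the object size, object count and freelist layout are invented for the demo.

/* Not part of the patch: toy version of the bitmap trick in process_slab().
 * The "slab" is a plain buffer and all sizes are invented for the demo. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OBJ_SIZE   64
#define NR_OBJECTS 8

int main(void)
{
	char *slab = calloc(NR_OBJECTS, OBJ_SIZE);
	unsigned char free_map[NR_OBJECTS] = { 0 };
	int free_idx[] = { 1, 4, 6 };	/* slots we pretend are free */
	void *freelist = NULL;
	void *p;

	if (!slab)
		return 1;

	/* Build a freelist: each free object stores the pointer to the
	 * next free object at its start, like SLUB's in-object pointer. */
	for (int i = 0; i < 3; i++) {
		void *obj = slab + free_idx[i] * OBJ_SIZE;

		memcpy(obj, &freelist, sizeof(void *));
		freelist = obj;
	}

	/* Pass 1: walk the freelist and mark the free slots. */
	for (p = freelist; p; memcpy(&p, p, sizeof(void *)))
		free_map[((char *)p - slab) / OBJ_SIZE] = 1;

	/* Pass 2: every slot not marked free is a live object. */
	for (int i = 0; i < NR_OBJECTS; i++)
		if (!free_map[i])
			printf("object %d at %p is in use\n",
			       i, (void *)(slab + i * OBJ_SIZE));

	free(slab);
	return 0;
}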
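
Illustrative note (not part of the patch above): for mergeable caches, create_unique_id() builds the sysfs directory name from the flags that must match during merging plus the object size, printed as a zero-padded %07d, e.g. ":da-0000192" for a DMA, reclaim-accounted cache of 192-byte objects. The standalone sketch below reproduces only that formatting; the F_* constants are stand-ins, not the kernel's SLAB_* flag values.

/* Not part of the patch: sketch of the ":<flags>-<size>" id format built
 * by create_unique_id(). F_* values are stand-ins for the SLAB_* flags. */
#include <stdio.h>

#define F_CACHE_DMA		0x1	/* stand-in for SLAB_CACHE_DMA */
#define F_RECLAIM_ACCOUNT	0x2	/* stand-in for SLAB_RECLAIM_ACCOUNT */
#define F_DEBUG_FREE		0x4	/* stand-in for SLAB_DEBUG_FREE */

static void format_id(char *buf, unsigned long flags, int size)
{
	char *p = buf;

	*p++ = ':';
	if (flags & F_CACHE_DMA)
		*p++ = 'd';
	if (flags & F_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (flags & F_DEBUG_FREE)
		*p++ = 'F';
	if (p != buf + 1)	/* add '-' only if a flag letter was emitted */
		*p++ = '-';
	sprintf(p, "%07d", size);
}

int main(void)
{
	char id[64];

	format_id(id, F_CACHE_DMA | F_RECLAIM_ACCOUNT, 192);
	printf("%s\n", id);	/* ":da-0000192" */

	format_id(id, 0, 1024);
	printf("%s\n", id);	/* ":0001024" */
	return 0;
}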