aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2007-05-08 13:37:51 +1000
committerPaul Mackerras <paulus@samba.org>2007-05-08 13:37:51 +1000
commit02bbc0f09c90cefdb2837605c96a66c5ce4ba2e1 (patch)
tree04ef573cd4de095c500c9fc3477f4278c0b36300 /mm
parent7487a2245b8841c77ba9db406cf99a483b9334e9 (diff)
parent5b94f675f57e4ff16c8fda09088d7480a84dcd91 (diff)
Merge branch 'linux-2.6'
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig5
-rw-r--r--mm/Makefile3
-rw-r--r--mm/filemap.c85
-rw-r--r--mm/highmem.c9
-rw-r--r--mm/internal.h2
-rw-r--r--mm/madvise.c33
-rw-r--r--mm/memory.c106
-rw-r--r--mm/mmap.c47
-rw-r--r--mm/oom_kill.c17
-rw-r--r--mm/page-writeback.c50
-rw-r--r--mm/page_alloc.c50
-rw-r--r--mm/quicklist.c88
-rw-r--r--mm/readahead.c29
-rw-r--r--mm/rmap.c3
-rw-r--r--mm/shmem.c3
-rw-r--r--mm/slab.c200
-rw-r--r--mm/slob.c57
-rw-r--r--mm/slub.c3520
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/vmalloc.c14
-rw-r--r--mm/vmscan.c13
23 files changed, 4100 insertions, 241 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7942b333e46..1ac718f636e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -163,3 +163,8 @@ config ZONE_DMA_FLAG
default "0" if !ZONE_DMA
default "1"
+config NR_QUICK
+ int
+ depends on QUICKLIST
+ default "1"
+
diff --git a/mm/Makefile b/mm/Makefile
index f3c077eb0b8..a9148ea329a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -25,7 +25,10 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_SLAB) += slab.o
+obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
+obj-$(CONFIG_QUICKLIST) += quicklist.o
+
diff --git a/mm/filemap.c b/mm/filemap.c
index 5dfc093ceb3..5631d6b2a62 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -868,6 +868,7 @@ void do_generic_mapping_read(struct address_space *mapping,
unsigned long last_index;
unsigned long next_index;
unsigned long prev_index;
+ unsigned int prev_offset;
loff_t isize;
struct page *cached_page;
int error;
@@ -876,7 +877,8 @@ void do_generic_mapping_read(struct address_space *mapping,
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
next_index = index;
- prev_index = ra.prev_page;
+ prev_index = ra.prev_index;
+ prev_offset = ra.prev_offset;
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
@@ -924,10 +926,10 @@ page_ok:
flush_dcache_page(page);
/*
- * When (part of) the same page is read multiple times
- * in succession, only mark it as accessed the first time.
+ * When a sequential read accesses a page several times,
+ * only mark it as accessed the first time.
*/
- if (prev_index != index)
+ if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
@@ -945,6 +947,8 @@ page_ok:
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
+ prev_offset = offset;
+ ra.prev_offset = offset;
page_cache_release(page);
if (ret == nr && desc->count)
@@ -1446,30 +1450,6 @@ page_not_uptodate:
majmin = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
}
- lock_page(page);
-
- /* Did it get unhashed while we waited for it? */
- if (!page->mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto retry_all;
- }
-
- /* Did somebody else get it up-to-date? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto success;
- }
-
- error = mapping->a_ops->readpage(file, page);
- if (!error) {
- wait_on_page_locked(page);
- if (PageUptodate(page))
- goto success;
- } else if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry_find;
- }
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -1726,7 +1706,7 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
-static inline struct page *__read_cache_page(struct address_space *mapping,
+static struct page *__read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
@@ -1763,17 +1743,11 @@ repeat:
return page;
}
-/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: destination for read data
- *
- * Read into the page cache. If a page already exists,
- * and PageUptodate() is not set, try to fill the page.
+/*
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
*/
-struct page *read_cache_page(struct address_space *mapping,
+struct page *read_cache_page_async(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
@@ -1805,6 +1779,39 @@ retry:
page = ERR_PTR(err);
}
out:
+ mark_page_accessed(page);
+ return page;
+}
+EXPORT_SYMBOL(read_cache_page_async);
+
+/**
+ * read_cache_page - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page then wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page(struct address_space *mapping,
+ unsigned long index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ struct page *page;
+
+ page = read_cache_page_async(mapping, index, filler, data);
+ if (IS_ERR(page))
+ goto out;
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ out:
return page;
}
EXPORT_SYMBOL(read_cache_page);
diff --git a/mm/highmem.c b/mm/highmem.c
index 51e1c1995fe..be8f8d36a8b 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,6 +99,15 @@ static void flush_all_zero_pkmaps(void)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
+/* Flush all unused kmap mappings in order to remove stray
+ mappings. */
+void kmap_flush_unused(void)
+{
+ spin_lock(&kmap_lock);
+ flush_all_zero_pkmaps();
+ spin_unlock(&kmap_lock);
+}
+
static inline unsigned long map_new_virtual(struct page *page)
{
unsigned long vaddr;
diff --git a/mm/internal.h b/mm/internal.h
index d527b80b292..a3110c02aea 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+ VM_BUG_ON(PageCompound(page) && PageTail(page));
VM_BUG_ON(atomic_read(&page->_count));
set_page_count(page, 1);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 603c5257ed6..e75096b5a6d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -12,6 +12,24 @@
#include <linux/hugetlb.h>
/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_sem for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static int madvise_need_mmap_write(int behavior)
+{
+ switch (behavior) {
+ case MADV_REMOVE:
+ case MADV_WILLNEED:
+ case MADV_DONTNEED:
+ return 0;
+ default:
+ /* be safe, default to 1. list exceptions explicitly */
+ return 1;
+ }
+}
+
+/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
@@ -183,9 +201,9 @@ static long madvise_remove(struct vm_area_struct *vma,
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
- up_write(&current->mm->mmap_sem);
+ up_read(&current->mm->mmap_sem);
error = vmtruncate_range(mapping->host, offset, endoff);
- down_write(&current->mm->mmap_sem);
+ down_read(&current->mm->mmap_sem);
return error;
}
@@ -270,7 +288,10 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
int error = -EINVAL;
size_t len;
- down_write(&current->mm->mmap_sem);
+ if (madvise_need_mmap_write(behavior))
+ down_write(&current->mm->mmap_sem);
+ else
+ down_read(&current->mm->mmap_sem);
if (start & ~PAGE_MASK)
goto out;
@@ -332,6 +353,10 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
vma = find_vma(current->mm, start);
}
out:
- up_write(&current->mm->mmap_sem);
+ if (madvise_need_mmap_write(behavior))
+ up_write(&current->mm->mmap_sem);
+ else
+ up_read(&current->mm->mmap_sem);
+
return error;
}
diff --git a/mm/memory.c b/mm/memory.c
index e7066e71dfa..1d647ab0ee7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1448,6 +1448,100 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(remap_pfn_range);
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pte_t *pte;
+ int err;
+ struct page *pmd_page;
+ spinlock_t *uninitialized_var(ptl);
+
+ pte = (mm == &init_mm) ?
+ pte_alloc_kernel(pmd, addr) :
+ pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+
+ BUG_ON(pmd_huge(*pmd));
+
+ pmd_page = pmd_page(*pmd);
+
+ do {
+ err = fn(pte, pmd_page, addr, data);
+ if (err)
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (mm != &init_mm)
+ pte_unmap_unlock(pte-1, ptl);
+ return err;
+}
+
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pmd_t *pmd;
+ unsigned long next;
+ int err;
+
+ pmd = pmd_alloc(mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pmd++, addr = next, addr != end);
+ return err;
+}
+
+static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ pte_fn_t fn, void *data)
+{
+ pud_t *pud;
+ unsigned long next;
+ int err;
+
+ pud = pud_alloc(mm, pgd, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pud++, addr = next, addr != end);
+ return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ unsigned long size, pte_fn_t fn, void *data)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + size;
+ int err;
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+ return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
/*
* handle_pte_fault chooses page fault handler according to an entry
* which was read non-atomically. Before making any commitment, on
@@ -2539,12 +2633,6 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
spin_unlock(&mm->page_table_lock);
return 0;
}
-#else
-/* Workaround for gcc 2.96 */
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
-{
- return 0;
-}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
@@ -2573,12 +2661,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
spin_unlock(&mm->page_table_lock);
return 0;
}
-#else
-/* Workaround for gcc 2.96 */
-int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
-{
- return 0;
-}
#endif /* __PAGETABLE_PMD_FOLDED */
int make_pages_present(unsigned long addr, unsigned long end)
diff --git a/mm/mmap.c b/mm/mmap.c
index 84f997da78d..52646d61ff6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#include <asm/mmu_context.h>
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags) (0)
@@ -1199,6 +1200,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (len > TASK_SIZE)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
@@ -1272,6 +1276,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (len > TASK_SIZE)
return -ENOMEM;
+ if (flags & MAP_FIXED)
+ return addr;
+
/* requesting a specific address */
if (addr) {
addr = PAGE_ALIGN(addr);
@@ -1360,38 +1367,21 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
unsigned long ret;
-
- if (!(flags & MAP_FIXED)) {
- unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-
- get_area = current->mm->get_unmapped_area;
- if (file && file->f_op && file->f_op->get_unmapped_area)
- get_area = file->f_op->get_unmapped_area;
- addr = get_area(file, addr, len, pgoff, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
- }
+ unsigned long (*get_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+
+ get_area = current->mm->get_unmapped_area;
+ if (file && file->f_op && file->f_op->get_unmapped_area)
+ get_area = file->f_op->get_unmapped_area;
+ addr = get_area(file, addr, len, pgoff, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (addr & ~PAGE_MASK)
return -EINVAL;
- if (file && is_file_hugepages(file)) {
- /*
- * Check if the given range is hugepage aligned, and
- * can be made suitable for hugepages.
- */
- ret = prepare_hugepage_range(addr, len, pgoff);
- } else {
- /*
- * Ensure that a normal request is not falling in a
- * reserved hugepage range. For some archs like IA-64,
- * there is a separate region for hugepages.
- */
- ret = is_hugepage_only_range(current->mm, addr, len);
- }
- if (ret)
- return -EINVAL;
+
return addr;
}
@@ -1979,6 +1969,9 @@ void exit_mmap(struct mm_struct *mm)
unsigned long nr_accounted = 0;
unsigned long end;
+ /* mm's last user has gone, and its about to be pulled down */
+ arch_exit_mmap(mm);
+
lru_add_drain();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3791edfffee..a7001410ab1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -147,9 +147,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
* Adjust the score by oomkilladj.
*/
if (p->oomkilladj) {
- if (p->oomkilladj > 0)
+ if (p->oomkilladj > 0) {
+ if (!points)
+ points = 1;
points <<= p->oomkilladj;
- else
+ } else
points >>= -(p->oomkilladj);
}
@@ -397,6 +399,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
struct task_struct *p;
unsigned long points = 0;
unsigned long freed = 0;
+ int constraint;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
@@ -411,14 +414,18 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
show_mem();
}
- cpuset_lock();
- read_lock(&tasklist_lock);
+ if (sysctl_panic_on_oom == 2)
+ panic("out of memory. Compulsory panic_on_oom is selected.\n");
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- switch (constrained_alloc(zonelist, gfp_mask)) {
+ constraint = constrained_alloc(zonelist, gfp_mask);
+ cpuset_lock();
+ read_lock(&tasklist_lock);
+
+ switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
oom_kill_process(current, points,
"No available memory (MPOL_BIND)");
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a794945fd19..029dfad5a23 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -119,6 +119,44 @@ static void background_writeout(unsigned long _min_pages);
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+ int node;
+ unsigned long x = 0;
+
+ for_each_online_node(node) {
+ struct zone *z =
+ &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+ x += zone_page_state(z, NR_FREE_PAGES)
+ + zone_page_state(z, NR_INACTIVE)
+ + zone_page_state(z, NR_ACTIVE);
+ }
+ /*
+ * Make sure that the number of highmem pages is never larger
+ * than the number of the total dirtyable memory. This can only
+ * occur in very strange VM situations but we want to make sure
+ * that this does not occur.
+ */
+ return min(x, total);
+#else
+ return 0;
+#endif
+}
+
+static unsigned long determine_dirtyable_memory(void)
+{
+ unsigned long x;
+
+ x = global_page_state(NR_FREE_PAGES)
+ + global_page_state(NR_INACTIVE)
+ + global_page_state(NR_ACTIVE);
+ x -= highmem_dirtyable_memory(x);
+ return x + 1; /* Ensure that we never return 0 */
+}
+
static void
get_dirty_limits(long *pbackground, long *pdirty,
struct address_space *mapping)
@@ -128,20 +166,12 @@ get_dirty_limits(long *pbackground, long *pdirty,
int unmapped_ratio;
long background;
long dirty;
- unsigned long available_memory = vm_total_pages;
+ unsigned long available_memory = determine_dirtyable_memory();
struct task_struct *tsk;
-#ifdef CONFIG_HIGHMEM
- /*
- * We always exclude high memory from our count.
- */
- available_memory -= totalhigh_pages;
-#endif
-
-
unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
global_page_state(NR_ANON_PAGES)) * 100) /
- vm_total_pages;
+ available_memory;
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 353ce9039a8..59164313167 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -156,10 +156,8 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
static int page_is_consistent(struct zone *zone, struct page *page)
{
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(page)))
+ if (!pfn_valid_within(page_to_pfn(page)))
return 0;
-#endif
if (zone != page_zone(page))
return 0;
@@ -227,7 +225,7 @@ static void bad_page(struct page *page)
static void free_compound_page(struct page *page)
{
- __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+ __free_pages_ok(page, compound_order(page));
}
static void prep_compound_page(struct page *page, unsigned long order)
@@ -236,12 +234,13 @@ static void prep_compound_page(struct page *page, unsigned long order)
int nr_pages = 1 << order;
set_compound_page_dtor(page, free_compound_page);
- page[1].lru.prev = (void *)order;
- for (i = 0; i < nr_pages; i++) {
+ set_compound_order(page, order);
+ __SetPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- __SetPageCompound(p);
- set_page_private(p, (unsigned long)page);
+ __SetPageTail(p);
+ p->first_page = page;
}
}
@@ -250,16 +249,19 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (unlikely((unsigned long)page[1].lru.prev != order))
+ if (unlikely(compound_order(page) != order))
bad_page(page);
- for (i = 0; i < nr_pages; i++) {
+ if (unlikely(!PageHead(page)))
+ bad_page(page);
+ __ClearPageHead(page);
+ for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageCompound(p) |
- (page_private(p) != (unsigned long)page)))
+ if (unlikely(!PageTail(p) |
+ (p->first_page != page)))
bad_page(page);
- __ClearPageCompound(p);
+ __ClearPageTail(p);
}
}
@@ -346,10 +348,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
static inline int page_is_buddy(struct page *page, struct page *buddy,
int order)
{
-#ifdef CONFIG_HOLES_IN_ZONE
- if (!pfn_valid(page_to_pfn(buddy)))
+ if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
-#endif
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
@@ -433,13 +433,18 @@ static inline int free_pages_check(struct page *page)
1 << PG_private |
1 << PG_locked |
1 << PG_active |
- 1 << PG_reclaim |
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
1 << PG_buddy ))))
bad_page(page);
+ /*
+ * PageReclaim == PageTail. It is only an error
+ * for PageReclaim to be set if PageCompound is clear.
+ */
+ if (unlikely(!PageCompound(page) && PageReclaim(page)))
+ bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
/*
@@ -665,7 +670,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
}
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly;
+int nr_node_ids __read_mostly = MAX_NUMNODES;
EXPORT_SYMBOL(nr_node_ids);
/*
@@ -770,8 +775,8 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
- if (!PageNosave(page))
- ClearPageNosaveFree(page);
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
}
for (order = MAX_ORDER - 1; order >= 0; --order)
@@ -780,7 +785,7 @@ void mark_free_pages(struct zone *zone)
pfn = page_to_pfn(list_entry(curr, struct page, lru));
for (i = 0; i < (1UL << order); i++)
- SetPageNosaveFree(pfn_to_page(pfn + i));
+ swsusp_set_page_free(pfn_to_page(pfn + i));
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -3203,7 +3208,8 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
- setup_per_zone_pages_min();
+ if (write)
+ setup_per_zone_pages_min();
return 0;
}
diff --git a/mm/quicklist.c b/mm/quicklist.c
new file mode 100644
index 00000000000..ae8189c2799
--- /dev/null
+++ b/mm/quicklist.c
@@ -0,0 +1,88 @@
+/*
+ * Quicklist support.
+ *
+ * Quicklists are light weight lists of pages that have a defined state
+ * on alloc and free. Pages must be in the quicklist specific defined state
+ * (zero by default) when the page is freed. It seems that the initial idea
+ * for such lists first came from Dave Miller and then various other people
+ * improved on it.
+ *
+ * Copyright (C) 2007 SGI,
+ * Christoph Lameter <clameter@sgi.com>
+ * Generalized, added support for multiple lists and
+ * constructors / destructors.
+ */
+#include <linux/kernel.h>
+
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/quicklist.h>
+
+DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
+
+#define FRACTION_OF_NODE_MEM 16
+
+static unsigned long max_pages(unsigned long min_pages)
+{
+ unsigned long node_free_pages, max;
+
+ node_free_pages = node_page_state(numa_node_id(),
+ NR_FREE_PAGES);
+ max = node_free_pages / FRACTION_OF_NODE_MEM;
+ return max(max, min_pages);
+}
+
+static long min_pages_to_free(struct quicklist *q,
+ unsigned long min_pages, long max_free)
+{
+ long pages_to_free;
+
+ pages_to_free = q->nr_pages - max_pages(min_pages);
+
+ return min(pages_to_free, max_free);
+}
+
+/*
+ * Trim down the number of pages in the quicklist
+ */
+void quicklist_trim(int nr, void (*dtor)(void *),
+ unsigned long min_pages, unsigned long max_free)
+{
+ long pages_to_free;
+ struct quicklist *q;
+
+ q = &get_cpu_var(quicklist)[nr];
+ if (q->nr_pages > min_pages) {
+ pages_to_free = min_pages_to_free(q, min_pages, max_free);
+
+ while (pages_to_free > 0) {
+ /*
+ * We pass a gfp_t of 0 to quicklist_alloc here
+ * because we will never call into the page allocator.
+ */
+ void *p = quicklist_alloc(nr, 0, NULL);
+
+ if (dtor)
+ dtor(p);
+ free_page((unsigned long)p);
+ pages_to_free--;
+ }
+ }
+ put_cpu_var(quicklist);
+}
+
+unsigned long quicklist_total_size(void)
+{
+ unsigned long count = 0;
+ int cpu;
+ struct quicklist *ql, *q;
+
+ for_each_online_cpu(cpu) {
+ ql = per_cpu(quicklist, cpu);
+ for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
+ count += q->nr_pages;
+ }
+ return count;
+}
+
diff --git a/mm/readahead.c b/mm/readahead.c
index 93d9ee692fd..9861e883fe5 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -37,7 +37,7 @@ void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
ra->ra_pages = mapping->backing_dev_info->ra_pages;
- ra->prev_page = -1;
+ ra->prev_index = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -202,17 +202,19 @@ out:
* size: Number of pages in that read
* Together, these form the "current window".
* Together, start and size represent the `readahead window'.
- * prev_page: The page which the readahead algorithm most-recently inspected.
+ * prev_index: The page which the readahead algorithm most-recently inspected.
* It is mainly used to detect sequential file reading.
* If page_cache_readahead sees that it is again being called for
* a page which it just looked at, it can return immediately without
* making any state changes.
+ * offset: Offset in the prev_index where the last read ended - used for
+ * detection of sequential file reading.
* ahead_start,
* ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
*
* When readahead is in the off state (size == 0), readahead is disabled.
- * In this state, prev_page is used to detect the resumption of sequential I/O.
+ * In this state, prev_index is used to detect the resumption of sequential I/O.
*
* The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages
@@ -415,7 +417,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
ra->ahead_size = get_next_ra_size(ra);
ra->ahead_start = ra->start + ra->size;
- block = force || (ra->prev_page >= ra->ahead_start);
+ block = force || (ra->prev_index >= ra->ahead_start);
ret = blockable_page_cache_readahead(mapping, filp,
ra->ahead_start, ra->ahead_size, ra, block);
@@ -467,12 +469,13 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
* We avoid doing extra work and bogusly perturbing the readahead
* window expansion logic.
*/
- if (offset == ra->prev_page && --req_size)
+ if (offset == ra->prev_index && --req_size)
++offset;
- /* Note that prev_page == -1 if it is a first read */
- sequential = (offset == ra->prev_page + 1);
- ra->prev_page = offset;
+ /* Note that prev_index == -1 if it is a first read */
+ sequential = (offset == ra->prev_index + 1);
+ ra->prev_index = offset;
+ ra->prev_offset = 0;
max = get_max_readahead(ra);
newsize = min(req_size, max);
@@ -481,7 +484,7 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
goto out;
- ra->prev_page += newsize - 1;
+ ra->prev_index += newsize - 1;
/*
* Special case - first read at start of file. We'll assume it's
@@ -537,18 +540,18 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
* we get called back on the first page of the ahead window which
* will allow us to submit more IO.
*/
- if (ra->prev_page >= ra->ahead_start) {
+ if (ra->prev_index >= ra->ahead_start) {
ra->start = ra->ahead_start;
ra->size = ra->ahead_size;
make_ahead_window(mapping, filp, ra, 0);
recheck:
- /* prev_page shouldn't overrun the ahead window */
- ra->prev_page = min(ra->prev_page,
+ /* prev_index shouldn't overrun the ahead window */
+ ra->prev_index = min(ra->prev_index,
ra->ahead_start + ra->ahead_size - 1);
}
out:
- return ra->prev_page + 1;
+ return ra->prev_index + 1;
}
EXPORT_SYM