diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/bootmem.c | 37 | ||||
-rw-r--r-- | mm/bounce.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 26 | ||||
-rw-r--r-- | mm/filemap_xip.c | 65 | ||||
-rw-r--r-- | mm/highmem.c | 5 | ||||
-rw-r--r-- | mm/hugetlb.c | 62 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/mempolicy.c | 1 | ||||
-rw-r--r-- | mm/mm_init.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 24 | ||||
-rw-r--r-- | mm/mmzone.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/page_alloc.c | 24 | ||||
-rw-r--r-- | mm/page_isolation.c | 13 | ||||
-rw-r--r-- | mm/quicklist.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 39 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 1 | ||||
-rw-r--r-- | mm/slob.c | 9 | ||||
-rw-r--r-- | mm/slub.c | 32 | ||||
-rw-r--r-- | mm/sparse.c | 1 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 26 | ||||
-rw-r--r-- | mm/truncate.c | 4 | ||||
-rw-r--r-- | mm/util.c | 15 | ||||
-rw-r--r-- | mm/vmalloc.c | 7 | ||||
-rw-r--r-- | mm/vmstat.c | 19 |
28 files changed, 340 insertions, 123 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 446c6588c75..91ee3922510 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -config HAVE_GET_USER_PAGES_FAST - bool - # # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's # to represent different areas of memory. This variable allows @@ -190,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA diff --git a/mm/bootmem.c b/mm/bootmem.c index 4af15d0340a..ad8eec6e44a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -405,6 +405,29 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) +{ + unsigned long base = bdata->node_min_pfn; + + /* + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. + */ + + return ALIGN(base + idx, step) - base; +} + +static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, + unsigned long align) +{ + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; +} + static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -441,7 +464,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, else start = ALIGN(min, step); - sidx = start - bdata->node_min_pfn;; + sidx = start - bdata->node_min_pfn; midx = max - bdata->node_min_pfn; if (bdata->hint_idx > sidx) { @@ -450,7 +473,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, * catch the fallback below. */ fallback = sidx + 1; - sidx = ALIGN(bdata->hint_idx, step); + sidx = align_idx(bdata, bdata->hint_idx, step); } while (1) { @@ -459,7 +482,7 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata, unsigned long eidx, i, start_off, end_off; find_block: sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); - sidx = ALIGN(sidx, step); + sidx = align_idx(bdata, sidx, step); eidx = sidx + PFN_UP(size); if (sidx >= midx || eidx > midx) @@ -467,15 +490,15 @@ find_block: for (i = sidx; i < eidx; i++) if (test_bit(i, bdata->node_bootmem_map)) { - sidx = ALIGN(i, step); + sidx = align_idx(bdata, i, step); if (sidx == i) sidx += step; goto find_block; } - if (bdata->last_end_off && + if (bdata->last_end_off & (PAGE_SIZE - 1) && PFN_DOWN(bdata->last_end_off) + 1 == sidx) - start_off = ALIGN(bdata->last_end_off, align); + start_off = align_off(bdata, bdata->last_end_off, align); else start_off = PFN_PHYS(sidx); @@ -499,7 +522,7 @@ find_block: } if (fallback) { - sidx = ALIGN(fallback - 1, step); + sidx = align_idx(bdata, fallback - 1, step); fallback = 0; goto find_block; } diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019..06722c40305 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * Data-less bio, nothing to bounce */ - if (bio_empty_barrier(*bio_orig)) + if (!bio_has_data(*bio_orig)) return; /* diff --git a/mm/filemap.c b/mm/filemap.c index 54e96865085..494ff20b6cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1100,8 +1100,9 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ @@ -1130,8 +1131,9 @@ readpage: } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1143,15 +1145,14 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -2129,13 +2130,20 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). + * without clobbering -EIOCBQUEUED from ->direct_IO(). */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, pos >> PAGE_CACHE_SHIFT, end); - if (written) + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; goto out; + } } written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 380ab402d71..b5167dfb2f2 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,8 @@ #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -22,22 +24,18 @@ * We do use our own empty page to avoid interference with other users * of ZERO_PAGE(), such as /dev/zero */ +static DEFINE_MUTEX(xip_sparse_mutex); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; static struct page *__xip_sparse_page; +/* called under xip_sparse_mutex */ static struct page *xip_sparse_page(void) { if (!__xip_sparse_page) { struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - if (page) { - static DEFINE_SPINLOCK(xip_alloc_lock); - spin_lock(&xip_alloc_lock); - if (!__xip_sparse_page) - __xip_sparse_page = page; - else - __free_page(page); - spin_unlock(&xip_alloc_lock); - } + if (page) + __xip_sparse_page = page; } return __xip_sparse_page; } @@ -174,18 +172,23 @@ __xip_unmap (struct address_space * mapping, pte_t pteval; spinlock_t *ptl; struct page *page; + unsigned count; + int locked = 0; + + count = read_seqcount_begin(&xip_sparse_seq); page = __xip_sparse_page; if (!page) return; +retry: spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); @@ -198,6 +201,14 @@ __xip_unmap (struct address_space * mapping, } } spin_unlock(&mapping->i_mmap_lock); + + if (locked) { + mutex_unlock(&xip_sparse_mutex); + } else if (read_seqcount_retry(&xip_sparse_seq, count)) { + mutex_lock(&xip_sparse_mutex); + locked = 1; + goto retry; + } } /* @@ -218,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; /* XXX: are VM_FAULT_ codes OK? */ - +again: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -237,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int err; /* maybe shared writable, allocate new block */ + mutex_lock(&xip_sparse_mutex); error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (error) return VM_FAULT_SIGBUS; /* unmap sparse mappings at pgoff from all other vmas */ @@ -252,14 +265,34 @@ found: BUG_ON(err); return VM_FAULT_NOPAGE; } else { + int err, ret = VM_FAULT_OOM; + + mutex_lock(&xip_sparse_mutex); + write_seqcount_begin(&xip_sparse_seq); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (unlikely(!error)) { + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + goto again; + } + if (error != -ENODATA) + goto out; /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return VM_FAULT_OOM; + goto out; + err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, + page); + if (err == -ENOMEM) + goto out; - page_cache_get(page); - vmf->page = page; - return 0; + ret = VM_FAULT_NOPAGE; +out: + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + + return ret; } } @@ -308,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, &xip_mem, &xip_pfn); if (status == -ENODATA) { /* we allocate a new page unmap it */ + mutex_lock(&xip_sparse_mutex); status = a_ops->get_xip_mem(mapping, index, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); diff --git a/mm/highmem.c b/mm/highmem.c index e16e1523b68..b36b83b920f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) { int i; + int need_flush = 0; flush_cache_kmaps(); @@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void) &pkmap_page_table[i]); set_page_address(page, NULL); + need_flush = 1; } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } /** diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd9..67a71191136 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* @@ -1937,6 +1942,18 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1962,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -1973,6 +1991,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -1989,25 +2008,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7056c3bdb47..36896f3eb7f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. + */ + if (unlikely(!p)) + return NULL; + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } @@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + kmem_cache_free(page_cgroup_cache, pc); + return 0; + } /* * For every charge from the cgroup, increment reference count */ @@ -796,14 +809,21 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) if (mem_cgroup_subsys.disabled) return 0; + if (!mm) + return 0; rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } css_get(&mem->css); rcu_read_unlock(); do { progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress += res_counter_check_under_limit(&mem->res); } while (!progress && --retry); css_put(&mem->css); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e550bec2058..83369058ec1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { - LIST_HEAD(pagelist); int busy = 0; int err = 0; nodemask_t tmp; diff --git a/mm/mm_init.c b/mm/mm_init.c index 936ef2efd89..4e0e26591df 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -12,7 +12,7 @@ #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT -int __meminitdata mminit_loglevel; +int mminit_loglevel; #ifndef SECTIONS_SHIFT #define SECTIONS_SHIFT 0 diff --git a/mm/mmap.c b/mm/mmap.c index 971d0eda754..e7a5a68a9c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1030,6 +1030,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: @@ -2273,14 +2277,14 @@ int install_special_mapping(struct mm_struct *mm, static DEFINE_MUTEX(mm_all_locks_mutex); -static void vm_lock_anon_vma(struct anon_vma *anon_vma) +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock(&anon_vma->lock); + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the * anon_vma->lock. If some other vma in this mm shares @@ -2296,7 +2300,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma) } } -static void vm_lock_mapping(struct address_space *mapping) +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* @@ -2310,7 +2314,7 @@ static void vm_lock_mapping(struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - spin_lock(&mapping->i_mmap_lock); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); } } @@ -2358,11 +2362,17 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->anon_vma) - vm_lock_anon_vma(vma->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) - vm_lock_mapping(vma->vm_file->f_mapping); + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); } + ret = 0; out_unlock: diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6..16ce8b955dc 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z++); + *zone = zonelist_zone(z); return z; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee626..64e5b4bcd96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (__capable(p, CAP_SYS_RAWIO)) + if (has_capability(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 401d104d2bb..27b8681139f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -268,13 +268,14 @@ void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); __SetPageTail(p); p->first_page = page; } @@ -284,6 +285,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; if (unlikely(compound_order(page) != order)) bad_page(page); @@ -291,8 +293,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageHead(page))) bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); if (unlikely(!PageTail(p) | (p->first_page != page))) @@ -694,6 +697,9 @@ static int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; @@ -2516,6 +2522,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; page = pfn_to_page(pfn); + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + /* Blocks with reserved pages will never free, skip them. */ if (PageReserved(page)) continue; @@ -4064,7 +4074,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4437,7 +4447,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem(size); + table = alloc_bootmem_nopanic(size); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58033c..b70a7fec1ff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -2,7 +2,6 @@ * linux/mm/page_isolation.c */ -#include <stddef.h> #include <linux/mm.h> #include <linux/page-isolation.h> #include <linux/pageblock-flags.h> @@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, flags; struct page *page; + struct zone *zone; + int ret; pfn = start_pfn; /* @@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) if (pfn < end_pfn) return -EBUSY; /* Check all pages are free or Marked as ISOLATED */ - if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) - return 0; - return -EBUSY; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; } diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb39..8dbb6805ef3 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + return max(max, min_pages); } diff --git a/mm/rmap.c b/mm/rmap.c index 1ea4e6fcee7..0383acfcb06 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -224,10 +224,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * + * If @sync is false, page_check_address may perform a racy check to avoid + * the page table lock when the pte is not present (helpful when reclaiming + * highly shared pages). + * * On success returns with pte mapped and locked. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp) + unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; pud_t *pud; @@ -249,7 +253,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { + if (!sync && !pte_present(*pte)) { pte_unmap(pte); return NULL; } @@ -281,7 +285,7 @@ static int page_referenced_one(struct page *page, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -450,7 +454,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -659,23 +663,30 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) } /* - * It would be tidy to reset the PageAnon mapping h |