diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12 | ||||
-rw-r--r-- | mm/bootmem.c | 196 | ||||
-rw-r--r-- | mm/dmapool.c | 12 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 10 | ||||
-rw-r--r-- | mm/filemap_xip.c | 200 | ||||
-rw-r--r-- | mm/hugetlb.c | 78 | ||||
-rw-r--r-- | mm/internal.h | 3 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 228 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 186 | ||||
-rw-r--r-- | mm/mempolicy.c | 1051 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 33 | ||||
-rw-r--r-- | mm/mmzone.c | 30 | ||||
-rw-r--r-- | mm/nommu.c | 6 | ||||
-rw-r--r-- | mm/oom_kill.c | 58 | ||||
-rw-r--r-- | mm/page_alloc.c | 274 | ||||
-rw-r--r-- | mm/pagewalk.c | 8 | ||||
-rw-r--r-- | mm/rmap.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 144 | ||||
-rw-r--r-- | mm/slab.c | 17 | ||||
-rw-r--r-- | mm/slub.c | 18 | ||||
-rw-r--r-- | mm/sparse.c | 145 | ||||
-rw-r--r-- | mm/swap.c | 37 | ||||
-rw-r--r-- | mm/swapfile.c | 8 | ||||
-rw-r--r-- | mm/truncate.c | 11 | ||||
-rw-r--r-- | mm/vmalloc.c | 141 | ||||
-rw-r--r-- | mm/vmscan.c | 46 | ||||
-rw-r--r-- | mm/vmstat.c | 11 |
30 files changed, 1919 insertions, 1058 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0016ebd4dcb..3aa819d628c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +# +# If we have space for more page flags then we can enable additional +# optimizations and functionality. +# +# Regular Sparsemem takes page flag bits for the sectionid if it does not +# use a virtual memmap. Disable extended page flags for 32 bit platforms +# that require the use of a sectionid in the page flags. +# +config PAGEFLAGS_EXTENDED + def_bool y + depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/bootmem.c b/mm/bootmem.c index 2ccea700968..e8fb927392b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, * might be used for boot-time allocations - or it might get added * to the free page pool later on. */ -static int __init reserve_bootmem_core(bootmem_data_t *bdata, +static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size, int flags) { unsigned long sidx, eidx; unsigned long i; - int ret; + + BUG_ON(!size); + + /* out of range, don't hold other */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return 0; /* - * round up, partially reserved pages are considered - * fully reserved. + * Round up to index to the range. */ + if (addr > bdata->node_boot_start) + sidx= PFN_DOWN(addr - bdata->node_boot_start); + else + sidx = 0; + + eidx = PFN_UP(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + + for (i = sidx; i < eidx; i++) { + if (test_bit(i, bdata->node_bootmem_map)) { + if (flags & BOOTMEM_EXCLUSIVE) + return -EBUSY; + } + } + + return 0; + +} + +static void __init reserve_bootmem_core(bootmem_data_t *bdata, + unsigned long addr, unsigned long size, int flags) +{ + unsigned long sidx, eidx; + unsigned long i; + BUG_ON(!size); - BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); - BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); - BUG_ON(addr < bdata->node_boot_start); - sidx = PFN_DOWN(addr - bdata->node_boot_start); + /* out of range */ + if (addr + size < bdata->node_boot_start || + PFN_DOWN(addr) > bdata->node_low_pfn) + return; + + /* + * Round up to index to the range. + */ + if (addr > bdata->node_boot_start) + sidx= PFN_DOWN(addr - bdata->node_boot_start); + else + sidx = 0; + eidx = PFN_UP(addr + size - bdata->node_boot_start); + if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) + eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); - for (i = sidx; i < eidx; i++) + for (i = sidx; i < eidx; i++) { if (test_and_set_bit(i, bdata->node_bootmem_map)) { #ifdef CONFIG_DEBUG_BOOTMEM printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); #endif - if (flags & BOOTMEM_EXCLUSIVE) { - ret = -EBUSY; - goto err; - } } - - return 0; - -err: - /* unreserve memory we accidentally reserved */ - for (i--; i >= sidx; i--) - clear_bit(i, bdata->node_bootmem_map); - - return ret; + } } static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, @@ -206,9 +236,11 @@ void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { - unsigned long offset, remaining_size, areasize, preferred; + unsigned long areasize, preferred; unsigned long i, start = 0, incr, eidx, end_pfn; void *ret; + unsigned long node_boot_start; + void *node_bootmem_map; if (!size) { printk("__alloc_bootmem_core(): zero-sized request\n"); @@ -216,70 +248,83 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, } BUG_ON(align & (align-1)); - if (limit && bdata->node_boot_start >= limit) - return NULL; - /* on nodes without memory - bootmem_map is NULL */ if (!bdata->node_bootmem_map) return NULL; + /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ + node_boot_start = bdata->node_boot_start; + node_bootmem_map = bdata->node_bootmem_map; + if (align) { + node_boot_start = ALIGN(bdata->node_boot_start, align); + if (node_boot_start > bdata->node_boot_start) + node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + + PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; + } + + if (limit && node_boot_start >= limit) + return NULL; + end_pfn = bdata->node_low_pfn; limit = PFN_DOWN(limit); if (limit && end_pfn > limit) end_pfn = limit; - eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); - offset = 0; - if (align && (bdata->node_boot_start & (align - 1UL)) != 0) - offset = align - (bdata->node_boot_start & (align - 1UL)); - offset = PFN_DOWN(offset); + eidx = end_pfn - PFN_DOWN(node_boot_start); /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. */ - if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { - preferred = goal - bdata->node_boot_start; + preferred = 0; + if (goal && PFN_DOWN(goal) < end_pfn) { + if (goal > node_boot_start) + preferred = goal - node_boot_start; - if (bdata->last_success >= preferred) + if (bdata->last_success > node_boot_start && + bdata->last_success - node_boot_start >= preferred) if (!limit || (limit && limit > bdata->last_success)) - preferred = bdata->last_success; - } else - preferred = 0; + preferred = bdata->last_success - node_boot_start; + } - preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; + preferred = PFN_DOWN(ALIGN(preferred, align)); areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; incr = align >> PAGE_SHIFT ? : 1; restart_scan: - for (i = preferred; i < eidx; i += incr) { + for (i = preferred; i < eidx;) { unsigned long j; - i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); + + i = find_next_zero_bit(node_bootmem_map, eidx, i); i = ALIGN(i, incr); if (i >= eidx) break; - if (test_bit(i, bdata->node_bootmem_map)) + if (test_bit(i, node_bootmem_map)) { + i += incr; continue; + } for (j = i + 1; j < i + areasize; ++j) { if (j >= eidx) goto fail_block; - if (test_bit(j, bdata->node_bootmem_map)) + if (test_bit(j, node_bootmem_map)) goto fail_block; } start = i; goto found; fail_block: i = ALIGN(j, incr); + if (i == j) + i += incr; } - if (preferred > offset) { - preferred = offset; + if (preferred > 0) { + preferred = 0; goto restart_scan; } return NULL; found: - bdata->last_success = PFN_PHYS(start); + bdata->last_success = PFN_PHYS(start) + node_boot_start; BUG_ON(start >= eidx); /* @@ -289,6 +334,7 @@ found: */ if (align < PAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) { + unsigned long offset, remaining_size; offset = ALIGN(bdata->last_offset, align); BUG_ON(offset > PAGE_SIZE); remaining_size = PAGE_SIZE - offset; @@ -297,14 +343,12 @@ found: /* last_pos unchanged */ bdata->last_offset = offset + size; ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + - bdata->node_boot_start); + offset + node_boot_start); } else { remaining_size = size - remaining_size; areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + - bdata->node_boot_start); + offset + node_boot_start); bdata->last_pos = start + areasize - 1; bdata->last_offset = remaining_size; } @@ -312,14 +356,14 @@ found: } else { bdata->last_pos = start + areasize - 1; bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); + ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); } /* * Reserve the area now: */ for (i = start; i < start + areasize; i++) - if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) + if (unlikely(test_and_set_bit(i, node_bootmem_map))) BUG(); memset(ret, 0, size); return ret; @@ -401,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { + int ret; + + ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + if (ret < 0) + return; reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); } @@ -412,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { + register_page_bootmem_info_node(pgdat); return free_all_bootmem_core(pgdat); } @@ -426,7 +476,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { - return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags); + bootmem_data_t *bdata; + int ret; + + list_for_each_entry(bdata, &bdata_list, list) { + ret = can_reserve_bootmem_core(bdata, addr, size, flags); + if (ret < 0) + return ret; + } + list_for_each_entry(bdata, &bdata_list, list) + reserve_bootmem_core(bdata, addr, size, flags); + + return 0; } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ @@ -484,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem(size, align, goal); } +#ifdef CONFIG_SPARSEMEM +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + void *ptr; + unsigned long limit, goal, start_nr, end_nr, pfn; + struct pglist_data *pgdat; + + pfn = section_nr_to_pfn(section_nr); + goal = PFN_PHYS(pfn); + limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; + pgdat = NODE_DATA(early_pfn_to_nid(pfn)); + ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, + limit); + + if (!ptr) + return NULL; + + start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); + end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); + if (start_nr != section_nr || end_nr != section_nr) { + printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", + section_nr); + free_bootmem_core(pgdat->bdata, __pa(ptr), size); + ptr = NULL; + } + + return ptr; +} +#endif + #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif diff --git a/mm/dmapool.c b/mm/dmapool.c index 34aaac451a9..b1f0885dda2 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -37,6 +37,10 @@ #include <linux/types.h> #include <linux/wait.h> +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#define DMAPOOL_DEBUG 1 +#endif + struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; @@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); if (page->vaddr) { -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); @@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) { dma_addr_t dma = page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); @@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, page->offset = *(int *)(page->vaddr + offset); retval = offset + page->vaddr; *handle = offset + page->dma; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif done: @@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } offset = vaddr - page->vaddr; -#ifdef CONFIG_DEBUG_SLAB +#ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { if (pool->dev) dev_err(pool->dev, diff --git a/mm/fadvise.c b/mm/fadvise.c index 3c0f1e99f5e..343cfdfebd9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) goto out; } - if (mapping->a_ops->get_xip_page) { + if (mapping->a_ops->get_xip_mem) { switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: diff --git a/mm/filemap.c b/mm/filemap.c index 07e9d9258b4..239d36163bb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } + if (TestClearPageReclaim(page)) + rotate_reclaimable_page(page); + + if (!test_clear_page_writeback(page)) + BUG(); + smp_mb__after_clear_bit(); wake_up_page(page, PG_writeback); } diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 5e598c42afd..3e744abcce9 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -15,6 +15,7 @@ #include <linux/rmap.h> #include <linux/sched.h> #include <asm/tlbflush.h> +#include <asm/io.h> /* * We do use our own empty page to avoid interference with other users @@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void) /* * This is a file read routine for execute in place files, and uses - * the mapping->a_ops->get_xip_page() function for the actual low-level + * the mapping->a_ops->get_xip_mem() function for the actual low-level * stuff. * * Note the struct file* is not used at all. It may be NULL. */ -static void +static ssize_t do_xip_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) + char __user *buf, + size_t len, + loff_t *ppos) { struct inode *inode = mapping->host; pgoff_t index, end_index; unsigned long offset; - loff_t isize; + loff_t isize, pos; + size_t copied = 0, error = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + pos = *ppos; + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; isize = i_size_read(inode); if (!isize) goto out; end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - for (;;) { - struct page *page; - unsigned long nr, ret; + do { + unsigned long nr, left; + void *xip_mem; + unsigned long xip_pfn; + int zero = 0; /* nr is the maximum number of bytes to copy from this page */ nr = PAGE_CACHE_SIZE; @@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping, } } nr = nr - offset; + if (nr > len) + nr = len; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - goto no_xip_page; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) { + error = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(error)) { + if (error == -ENODATA) { /* sparse */ - page = ZERO_PAGE(0); - } else { - desc->error = PTR_ERR(page); + zero = 1; + } else goto out; - } } /* If users can be writing to this page using arbitrary @@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping, * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + /* address based flush */ ; /* - * Ok, we have the page, so now we can copy it to user space... + * Ok, we have the mem, so now we can copy it to user space... * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer @@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping, * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); - offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + if (!zero) + left = __copy_to_user(buf+copied, xip_mem+offset, nr); + else + left = __clear_user(buf + copied, nr); - if (ret == nr && desc->count) - continue; - goto out; + if (left) { + error = -EFAULT; + goto out; + } -no_xip_page: - /* Did not get the page. Report it */ - desc->error = -EIO; - goto out; - } + copied += (nr - left); + offset += (nr - left); + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + } while (copied < len); out: - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + *ppos = pos + copied; if (filp) file_accessed(filp); + + return (copied ? copied : error); } ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { - read_descriptor_t desc; - if (!access_ok(VERIFY_WRITE, buf, len)) return -EFAULT; - desc.written = 0; - desc.arg.buf = buf; - desc.count = len; - desc.error = 0; - - do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, - ppos, &desc, file_read_actor); - - if (desc.written) - return desc.written; - else - return desc.error; + return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp, + buf, len, ppos); } EXPORT_SYMBOL_GPL(xip_file_read); @@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping, * * This function is derived from filemap_fault, but used for execute in place */ -static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct file *file = area->vm_file; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; pgoff_t size; + void *xip_mem; + unsigned long xip_pfn; + struct page *page; + int error; /* XXX: are VM_FAULT_ codes OK? */ @@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 0); - if (!IS_ERR(page)) - goto out; - if (PTR_ERR(page) != -ENODATA) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (likely(!error)) + goto found; + if (error != -ENODATA) return VM_FAULT_OOM; /* sparse block */ - if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && - (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && + if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) && + (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) && (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { + int err; + /* maybe shared writable, allocate new block */ - page = mapping->a_ops->get_xip_page(mapping, - vmf->pgoff*(PAGE_SIZE/512), 1); - if (IS_ERR(page)) + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, + &xip_mem, &xip_pfn); + if (error) return VM_FAULT_SIGBUS; - /* unmap page at pgoff from all other vmas */ + /* unmap sparse mappings at pgoff from all other vmas */ __xip_unmap(mapping, vmf->pgoff); + +found: + err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, + xip_pfn); + if (err == -ENOMEM) + return VM_FAULT_OOM; + BUG_ON(err); + return VM_FAULT_NOPAGE; } else { /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) return VM_FAULT_OOM; - } -out: - page_cache_get(page); - vmf->page = page; - return 0; + page_cache_get(page); + vmf->page = page; + return 0; + } } static struct vm_operations_struct xip_file_vm_ops = { @@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = { int xip_file_mmap(struct file * file, struct vm_area_struct * vma) { - BUG_ON(!file->f_mapping->a_ops->get_xip_page); + BUG_ON(!file->f_mapping->a_ops->get_xip_mem); file_accessed(file); vma->vm_ops = &xip_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; + vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; return 0; } EXPORT_SYMBOL_GPL(xip_file_mmap); @@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf, const struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; - struct page *page; size_t bytes; ssize_t written = 0; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); do { unsigned long index; unsigned long offset; size_t copied; - char *kaddr; + void *xip_mem; + unsigned long xip_pfn; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ index = pos >> PAGE_CACHE_SHIFT; @@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf, if (bytes > count) bytes = count; - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { + status = a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (status == -ENODATA) { /* we allocate a new page unmap it */ - page = a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 1); - if (!IS_ERR(page)) + status = a_ops->get_xip_mem(mapping, index, 1, + &xip_mem, &xip_pfn); + if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); } - if (IS_ERR(page)) { - status = PTR_ERR(page); + if (status) break; - } - fault_in_pages_readable(buf, bytes); - kaddr = kmap_atomic(page, KM_USER0); copied = bytes - - __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); + __copy_from_user_nocache(xip_mem + offset, buf, bytes); if (likely(copied > 0)) { status = copied; @@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write); /* * truncate a page used for execute in place - * functionality is analog to block_truncate_page but does use get_xip_page + * functionality is analog to block_truncate_page but does use get_xip_mem * to get the page instead of page cache */ int @@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from) unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; unsigned length; - struct page *page; + void *xip_mem; + unsigned long xip_pfn; + int err; - BUG_ON(!mapping->a_ops->get_xip_page); + BUG_ON(!mapping->a_ops->get_xip_mem); blocksize = 1 << mapping->host->i_blkbits; length = offset & (blocksize - 1); @@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from) length = blocksize - length; - page = mapping->a_ops->get_xip_page(mapping, - index*(PAGE_SIZE/512), 0); - if (!page) - return -ENOMEM; - if (unlikely(IS_ERR(page))) { - if (PTR_ERR(page) == -ENODATA) + err = mapping->a_ops->get_xip_mem(mapping, index, 0, + &xip_mem, &xip_pfn); + if (unlikely(err)) { + if (err == -ENODATA) /* Hole? No need to truncate */ return 0; else - return PTR_ERR(page); + return err; } - zero_user(page, offset, length); + memset(xip_mem + offset, 0, length); return 0; } EXPORT_SYMBOL_GPL(xip_truncate_page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51c9e2c0164..df28c1773fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, int nid; struct page *page = NULL; struct mempolicy *mpol; + nodemask_t *nodemask; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol); - struct zone **z; - - for (z = zonelist->zones; *z; z++) { - nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && + htlb_alloc_mask, &mpol, &nodemask); + struct zone *zone; + struct zoneref *z; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { + nid = zone_to_nid(zone); + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && !list_empty(&hugepage_freelists[nid])) { page = list_entry(hugepage_freelists[nid].next, struct page, lru); @@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, break; } } - mpol_free(mpol); /* unref if mpol !NULL */ + mpol_cond_put(mpol); return page; } @@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page) } set_compound_page_dtor(page, NULL); set_page_refcounted(page); + arch_release_hugepage(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -198,6 +202,10 @@ static struct page *alloc_fresh_huge_page_node(int nid) htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, HUGETLB_PAGE_ORDER); if (page) { + if (arch_prepare_hugepage(page)) { + __free_pages(page, HUGETLB_PAGE_ORDER); + return 0; + } set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); nr_huge_pages++; @@ -239,6 +247,11 @@ static int alloc_fresh_huge_page(void) hugetlb_next_nid = next_nid; } while (!page && hugetlb_next_nid != start_nid); + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGAL |