Diffstat (limited to 'mm/madvise.c')
| -rw-r--r-- | mm/madvise.c | 207 |
1 file changed, 170 insertions, 37 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db7..a402f8fdc68 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,14 @@
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
 #include <linux/hugetlb.h>
+#include <linux/falloc.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -36,11 +42,11 @@ static int madvise_need_mmap_write(int behavior)
  * We can potentially split a vm area into separate
  * areas, each area with its own behavior.
  */
-static long madvise_behavior(struct vm_area_struct * vma,
+static long madvise_behavior(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev,
 		     unsigned long start, unsigned long end, int behavior)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	int error = 0;
 	pgoff_t pgoff;
 	unsigned long new_flags = vma->vm_flags;
@@ -65,12 +71,28 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		}
 		new_flags &= ~VM_DONTCOPY;
 		break;
+	case MADV_DONTDUMP:
+		new_flags |= VM_DONTDUMP;
+		break;
+	case MADV_DODUMP:
+		if (new_flags & VM_SPECIAL) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTDUMP;
+		break;
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
 		if (error)
 			goto out;
 		break;
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+		error = hugepage_madvise(vma, &new_flags, behavior);
+		if (error)
+			goto out;
+		break;
 	}
 
 	if (new_flags == vma->vm_flags) {
@@ -112,15 +134,105 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+	unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->private;
+	unsigned long index;
+
+	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		return 0;
+
+	for (index = start; index != end; index += PAGE_SIZE) {
+		pte_t pte;
+		swp_entry_t entry;
+		struct page *page;
+		spinlock_t *ptl;
+
+		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+		pte_unmap_unlock(orig_pte, ptl);
+
+		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+			continue;
+		entry = pte_to_swp_entry(pte);
+		if (unlikely(non_swap_entry(entry)))
+			continue;
+
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+								vma, index);
+		if (page)
+			page_cache_release(page);
+	}
+
+	return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	struct mm_walk walk = {
+		.mm = vma->vm_mm,
+		.pmd_entry = swapin_walk_pmd_entry,
+		.private = vma,
+	};
+
+	walk_page_range(start, end, &walk);
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct address_space *mapping)
+{
+	pgoff_t index;
+	struct page *page;
+	swp_entry_t swap;
+
+	for (; start < end; start += PAGE_SIZE) {
+		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+		page = find_get_entry(mapping, index);
+		if (!radix_tree_exceptional_entry(page)) {
+			if (page)
+				page_cache_release(page);
+			continue;
+		}
+		swap = radix_to_swp_entry(page);
+		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+								NULL, 0);
+		if (page)
+			page_cache_release(page);
+	}
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+#endif		/* CONFIG_SWAP */
+
 /*
  * Schedule all required I/O operations. Do not wait for completion.
  */
-static long madvise_willneed(struct vm_area_struct * vma,
-			     struct vm_area_struct ** prev,
+static long madvise_willneed(struct vm_area_struct *vma,
+			     struct vm_area_struct **prev,
 			     unsigned long start, unsigned long end)
 {
 	struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+		*prev = vma;
+		if (!file)
+			force_swapin_readahead(vma, start, end);
+		else
+			force_shm_swapin_readahead(vma, start, end,
+						file->f_mapping);
+		return 0;
+	}
+#endif
+
 	if (!file)
 		return -EBADF;
 
@@ -158,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma,
  * An interface that causes the system to free clean pages and flush
  * dirty pages is already available as msync(MS_INVALIDATE).
  */
-static long madvise_dontneed(struct vm_area_struct * vma,
-			     struct vm_area_struct ** prev,
+static long madvise_dontneed(struct vm_area_struct *vma,
+			     struct vm_area_struct **prev,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
@@ -188,33 +300,39 @@ static long madvise_remove(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end)
 {
-	struct address_space *mapping;
-	loff_t offset, endoff;
+	loff_t offset;
 	int error;
+	struct file *f;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
 
-	if (!vma->vm_file || !vma->vm_file->f_mapping
-		|| !vma->vm_file->f_mapping->host) {
+	f = vma->vm_file;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host) {
 		return -EINVAL;
 	}
 
 	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 		return -EACCES;
 
-	mapping = vma->vm_file->f_mapping;
-
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-	endoff = (loff_t)(end - vma->vm_start - 1)
-			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/*
+	 * Filesystem's fallocate may need to take i_mutex. We need to
+	 * explicitly grab a reference because the vma (and hence the
+	 * vma's reference to the file) can go away as soon as we drop
+	 * mmap_sem.
+	 */
+	get_file(f);
 	up_read(&current->mm->mmap_sem);
-	error = vmtruncate_range(mapping->host, offset, endoff);
+	error = do_fallocate(f,
+				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+					offset, end - start);
+	fput(f);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
@@ -225,29 +343,35 @@ static long madvise_remove(struct vm_area_struct *vma,
  */
 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 {
-	int ret = 0;
-
+	struct page *p;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	for (; start < end; start += PAGE_SIZE) {
-		struct page *p;
-		int ret = get_user_pages_fast(start, 1, 0, &p);
+	for (; start < end; start += PAGE_SIZE <<
+				compound_order(compound_head(p))) {
+		int ret;
+
+		ret = get_user_pages_fast(start, 1, 0, &p);
 		if (ret != 1)
 			return ret;
+
+		if (PageHWPoison(p)) {
+			put_page(p);
+			continue;
+		}
 		if (bhv == MADV_SOFT_OFFLINE) {
-			printk(KERN_INFO "Soft offlining page %lx at %lx\n",
+			pr_info("Soft offlining page %#lx at %#lx\n",
 				page_to_pfn(p), start);
 			ret = soft_offline_page(p, MF_COUNT_INCREASED);
 			if (ret)
-				break;
+				return ret;
 			continue;
 		}
-		printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+		pr_info("Injecting memory failure for page %#lx at %#lx\n",
 			page_to_pfn(p), start);
 		/* Ignore return value for now */
-		__memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
 	}
-	return ret;
+	return 0;
 }
 #endif
 
@@ -283,6 +407,12 @@ madvise_behavior_valid(int behavior)
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+#endif
+	case MADV_DONTDUMP:
+	case MADV_DODUMP:
 		return 1;
 
 	default:
@@ -335,11 +465,12 @@ madvise_behavior_valid(int behavior)
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	unsigned long end, tmp;
-	struct vm_area_struct * vma, *prev;
+	struct vm_area_struct *vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
 	int write;
 	size_t len;
+	struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -348,27 +479,27 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
-	write = madvise_need_mmap_write(behavior);
-	if (write)
-		down_write(&current->mm->mmap_sem);
-	else
-		down_read(&current->mm->mmap_sem);
-
 	if (start & ~PAGE_MASK)
-		goto out;
+		return error;
 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 
 	/* Check to see whether len was rounded up from small -ve to zero */
 	if (len_in && !len)
-		goto out;
+		return error;
 
 	end = start + len;
 	if (end < start)
-		goto out;
+		return error;
 
 	error = 0;
 	if (end == start)
-		goto out;
+		return error;
+
+	write = madvise_need_mmap_write(behavior);
+	if (write)
+		down_write(&current->mm->mmap_sem);
+	else
+		down_read(&current->mm->mmap_sem);
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -379,6 +510,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
+	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
@@ -414,6 +546,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		vma = find_vma(current->mm, start);
 	}
 out:
+	blk_finish_plug(&plug);
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
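For reference, every behaviour touched by this diff is reached from userspace through the same madvise(2) entry point. The sketch below is not part of the commit; it is a minimal, hypothetical caller that exercises MADV_DONTDUMP/MADV_DODUMP and MADV_WILLNEED on an anonymous mapping, assuming a kernel with these changes and a libc that already exposes those flags in <sys/mman.h>.

/*
 * Hypothetical userspace caller (not from the commit): exercises the
 * madvise behaviours added or rerouted above. Assumes MADV_DONTDUMP,
 * MADV_DODUMP and MADV_WILLNEED are defined by the installed headers.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	/* Anonymous mapping: MADV_WILLNEED takes the swap-readahead path. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);

	/* Exclude the region from core dumps, then re-include it. */
	if (madvise(p, len, MADV_DONTDUMP))
		perror("madvise(MADV_DONTDUMP)");
	if (madvise(p, len, MADV_DODUMP))
		perror("madvise(MADV_DODUMP)");

	/* Hint that the pages will be needed soon. */
	if (madvise(p, len, MADV_WILLNEED))
		perror("madvise(MADV_WILLNEED)");

	munmap(p, len);
	return 0;
}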
