diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 7 | ||||
-rw-r--r-- | mm/Makefile | 6 | ||||
-rw-r--r-- | mm/fadvise.c | 5 | ||||
-rw-r--r-- | mm/filemap.c | 82 | ||||
-rw-r--r-- | mm/filemap_xip.c | 8 | ||||
-rw-r--r-- | mm/hugetlb.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 590 | ||||
-rw-r--r-- | mm/mlock.c | 1 | ||||
-rw-r--r-- | mm/mmap.c | 1 | ||||
-rw-r--r-- | mm/mremap.c | 1 | ||||
-rw-r--r-- | mm/msync.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 8 | ||||
-rw-r--r-- | mm/page-writeback.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 174 | ||||
-rw-r--r-- | mm/pdflush.c | 2 | ||||
-rw-r--r-- | mm/rmap.c | 17 | ||||
-rw-r--r-- | mm/shmem.c | 45 | ||||
-rw-r--r-- | mm/slab.c | 1194 | ||||
-rw-r--r-- | mm/slob.c | 385 | ||||
-rw-r--r-- | mm/sparse.c | 4 | ||||
-rw-r--r-- | mm/swap.c | 28 | ||||
-rw-r--r-- | mm/swap_state.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 40 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 3 | ||||
-rw-r--r-- | mm/util.c | 39 | ||||
-rw-r--r-- | mm/vmscan.c | 381 |
28 files changed, 2198 insertions, 848 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b3db11f137e..a9cb80ae640 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if PARISC && !PA20 default "4" + +# +# support for page migration +# +config MIGRATION + def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM + depends on SWAP diff --git a/mm/Makefile b/mm/Makefile index 2fa6d2ca9f2..9aa03fa1dcc 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ - readahead.o slab.o swap.o truncate.o vmscan.o \ - prio_tree.o $(mmu-y) + readahead.o swap.o truncate.o vmscan.o \ + prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o @@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/fadvise.c b/mm/fadvise.c index 5f19e87bc5a..d257c89e770 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (!file) return -EBADF; + if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { + ret = -ESPIPE; + goto out; + } + mapping = file->f_mapping; if (!mapping || len < 0) { ret = -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index 4ef24a39768..44da3d47699 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/aio.h> +#include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/swap.h> @@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * - * ->i_sem + * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem @@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->lock_page (access_process_vm) * * ->mmap_sem - * ->i_sem (msync) + * ->i_mutex (msync) * - * ->i_sem + * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock @@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) @@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping, * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. * - * We need to re-take i_sem during the generic_osync_inode list walk because + * We need to re-take i_mutex during the generic_osync_inode list walk because * it is otherwise livelockable. */ int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, size_t count) + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); @@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range); /* - * Note: Holding i_sem across sync_page_range_nolock is not a good idea + * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */ -static int sync_page_range_nolock(struct inode *inode, - struct address_space *mapping, - loff_t pos, size_t count) +int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, + loff_t pos, loff_t count) { pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; @@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode, ret = wait_on_page_writeback_range(mapping, start, end); return ret; } +EXPORT_SYMBOL(sync_page_range_nolock); /** * filemap_fdatawait - walk the list of under-writeback pages of the given @@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait); int filemap_write_and_wait(struct address_space *mapping) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } } - return retval; + return err; } +EXPORT_SYMBOL(filemap_write_and_wait); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int retval = 0; + int err = 0; if (mapping->nrpages) { - retval = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - if (retval == 0) - retval = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = wait_on_page_writeback_range(mapping, + lstart >> PAGE_CACHE_SHIFT, + lend >> PAGE_CACHE_SHIFT); + if (!err) + err = err2; + } } - return retval; + return err; } /* @@ -1878,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. - * i_sem is held, which protects generic_osync_inode() from + * i_mutex is held, which protects generic_osync_inode() from * livelocking. */ if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2094,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { @@ -2181,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2206,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2242,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, struct inode *inode = mapping->host; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err; @@ -2258,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, EXPORT_SYMBOL(generic_file_writev); /* - * Called under i_sem for writes to S_ISREG files. Returns -EIO if something + * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. */ static ssize_t diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9cf687e4a29..b960ac8e591 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf, *ppos = pos; /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_sem. + * cannot change under us because we hold i_mutex. */ if (pos > inode->i_size) { i_size_write(inode, pos); @@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t pos; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { ret=-EFAULT; @@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (ret) goto out_backing; - inode_update_time(inode, 1); + file_update_time(filp); ret = __xip_file_write (filp, buf, count, pos, ppos); out_backing: current->backing_dev_info = NULL; out_up: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f4c43d7980b..b21d78c941b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -12,6 +12,7 @@ #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> +#include <linux/cpuset.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, for (z = zonelist->zones; *z; z++) { nid = (*z)->zone_pgdat->node_id; - if (!list_empty(&hugepage_freelists[nid])) + if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && + !list_empty(&hugepage_freelists[nid])) break; } diff --git a/mm/memory.c b/mm/memory.c index 7197f9bcd38..7a11ddd5060 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1784,13 +1784,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) if (!inode->i_op || !inode->i_op->truncate_range) return -ENOSYS; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); down_write(&inode->i_alloc_sem); unmap_mapping_range(mapping, offset, (end - offset), 1); truncate_inode_pages_range(mapping, offset, end); inode->i_op->truncate_range(inode, offset, end); up_write(&inode->i_alloc_sem); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return 0; } @@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } +EXPORT_SYMBOL_GPL(__handle_mm_fault); + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0f1d2b8a952..73790188b0e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -83,9 +83,18 @@ #include <linux/init.h> #include <linux/compat.h> #include <linux/mempolicy.h> +#include <linux/swap.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + #include <asm/tlbflush.h> #include <asm/uaccess.h> +/* Internal flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ + static kmem_cache_t *policy_cache; static kmem_cache_t *sn_cache; @@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } -/* Ensure all existing pages follow the policy. */ +static void gather_stats(struct page *, void *); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); + +/* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) { pte_t *orig_pte; pte_t *pte; @@ -192,8 +208,28 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ + if (PageReserved(page)) + continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & MPOL_MF_STATS) + gather_stats(page, private); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); + else break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -201,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) { pmd_t *pmd; unsigned long next; @@ -211,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) { pud_t *pud; unsigned long next; @@ -228,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, nodemask_t *nodes) + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) { pgd_t *pgd; unsigned long next; @@ -245,36 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; } -/* Step 1: check the range */ +/* Check if a vma is migratable */ +static inline int vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & ( + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) + return 0; + return 1; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. + */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - nodemask_t *nodes, unsigned long flags) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; + /* Clear the LRU lists so pages can be isolated */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + lru_add_drain_all(); + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { unsigned long endvma = vma->vm_end; + if (endvma > end) endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma, start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes, + flags, private); if (err) { first = ERR_PTR(err); break; @@ -333,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); - return mpol_check_policy(mode, nodes); -} - -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) - return -EINVAL; - if (start & ~PAGE_MASK) + cpuset_update_task_memory_state(); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) return -EINVAL; - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - if (end < start) - return -EINVAL; - if (end == start) - return 0; - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); - - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) - err = mbind_range(vma, start, end, new); - up_write(&mm->mmap_sem); - mpol_free(new); - return err; + return mpol_check_policy(mode, nodes); } /* Set the process memory policy */ @@ -448,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -500,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, } /* + * page migration + */ + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (isolate_lru_page(page)) + list_add(&page->lru, pagelist); + } +} + +static int swap_pages(struct list_head *pagelist) +{ + LIST_HEAD(moved); + LIST_HEAD(failed); + int n; + + n = migrate_pages(pagelist, NULL, &moved, &failed); + putback_lru_pages(&failed); + putback_lru_pages(&moved); + + return n; +} + +/* + * For now migrate_pages simply swaps out the pages from nodes that are in + * the source set but not in the target set. In the future, we would + * want a function that moves pages between the two nodesets in such + * a way as to preserve the physical layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int count = 0; + nodemask_t nodes; + + nodes_andnot(nodes, *from_nodes, *to_nodes); + + down_read(&mm->mmap_sem); + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { + count = swap_pages(&pagelist); + putback_lru_pages(&pagelist); + } + + up_read(&mm->mmap_sem); + return count; +} + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) + nr_failed = swap_pages(&pagelist); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + if (!list_empty(&pagelist)) + putback_lru_pages(&pagelist); + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + +/* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; @@ -593,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser priviledges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + /* Retrieve NUMA policy */ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -699,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; @@ -759,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + switch (policy->policy) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return policy->v.zonelist->zones[0]->zone_pgdat->node_id; + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) @@ -848,7 +1092,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -874,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. * - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ @@ -883,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (!pol || in_interrupt()) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) @@ -892,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This |