aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLen Brown <len.brown@intel.com>2006-01-27 17:18:29 -0500
committerLen Brown <len.brown@intel.com>2006-01-27 17:18:29 -0500
commit292dd876ee765c478b27c93cc51e93a558ed58bf (patch)
tree5b740e93253295baee2a9c414a6c66d03d44a9ef /mm
parentd4ec6c7cc9a15a7a529719bc3b84f46812f9842e (diff)
parent9fdb62af92c741addbea15545f214a6e89460865 (diff)
Pull release into acpica branch
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/Makefile6
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/filemap.c82
-rw-r--r--mm/filemap_xip.c8
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/memory.c6
-rw-r--r--mm/mempolicy.c590
-rw-r--r--mm/mlock.c1
-rw-r--r--mm/mmap.c1
-rw-r--r--mm/mremap.c1
-rw-r--r--mm/msync.c2
-rw-r--r--mm/oom_kill.c8
-rw-r--r--mm/page-writeback.c7
-rw-r--r--mm/page_alloc.c174
-rw-r--r--mm/pdflush.c2
-rw-r--r--mm/rmap.c17
-rw-r--r--mm/shmem.c45
-rw-r--r--mm/slab.c1194
-rw-r--r--mm/slob.c385
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/swap.c28
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c40
-rw-r--r--mm/tiny-shmem.c2
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/util.c39
-rw-r--r--mm/vmscan.c381
28 files changed, 2198 insertions, 848 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b3db11f137e..a9cb80ae640 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
default "4"
+
+#
+# support for page migration
+#
+config MIGRATION
+ def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+ depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f2..9aa03fa1dcc 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
- readahead.o slab.o swap.o truncate.o vmscan.o \
- prio_tree.o $(mmu-y)
+ readahead.o swap.o truncate.o vmscan.o \
+ prio_tree.o util.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5a..d257c89e770 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
if (!file)
return -EBADF;
+ if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
+ ret = -ESPIPE;
+ goto out;
+ }
+
mapping = file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ef24a39768..44da3d47699 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/aio.h>
+#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->swap_lock (exclusive_swap_page, others)
* ->mapping->tree_lock
*
- * ->i_sem
+ * ->i_mutex
* ->i_mmap_lock (truncate->unmap_mapping_range)
*
* ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->lock_page (access_process_vm)
*
* ->mmap_sem
- * ->i_sem (msync)
+ * ->i_mutex (msync)
*
- * ->i_sem
+ * ->i_mutex
* ->i_alloc_sem (various)
*
* ->inode_lock
@@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* ->inode_lock (page_remove_rmap->set_page_dirty)
@@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
* integrity" operation. It waits upon in-flight writeout before starting and
* waiting upon new writeout. If there was an IO error, return it.
*
- * We need to re-take i_sem during the generic_osync_inode list walk because
+ * We need to re-take i_mutex during the generic_osync_inode list walk because
* it is otherwise livelockable.
*/
int sync_page_range(struct inode *inode, struct address_space *mapping,
- loff_t pos, size_t count)
+ loff_t pos, loff_t count)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
return 0;
ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
if (ret == 0) {
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
}
if (ret == 0)
ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
EXPORT_SYMBOL(sync_page_range);
/*
- * Note: Holding i_sem across sync_page_range_nolock is not a good idea
+ * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
* as it forces O_SYNC writers to different parts of the same file
* to be serialised right until io completion.
*/
-static int sync_page_range_nolock(struct inode *inode,
- struct address_space *mapping,
- loff_t pos, size_t count)
+int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
+ loff_t pos, loff_t count)
{
pgoff_t start = pos >> PAGE_CACHE_SHIFT;
pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode,
ret = wait_on_page_writeback_range(mapping, start, end);
return ret;
}
+EXPORT_SYMBOL(sync_page_range_nolock);
/**
* filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
- int retval = 0;
+ int err = 0;
if (mapping->nrpages) {
- retval = filemap_fdatawrite(mapping);
- if (retval == 0)
- retval = filemap_fdatawait(mapping);
+ err = filemap_fdatawrite(mapping);
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
+ if (err != -EIO) {
+ int err2 = filemap_fdatawait(mapping);
+ if (!err)
+ err = err2;
+ }
}
- return retval;
+ return err;
}
+EXPORT_SYMBOL(filemap_write_and_wait);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- int retval = 0;
+ int err = 0;
if (mapping->nrpages) {
- retval = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
- if (retval == 0)
- retval = wait_on_page_writeback_range(mapping,
- lstart >> PAGE_CACHE_SHIFT,
- lend >> PAGE_CACHE_SHIFT);
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO) {
+ int err2 = wait_on_page_writeback_range(mapping,
+ lstart >> PAGE_CACHE_SHIFT,
+ lend >> PAGE_CACHE_SHIFT);
+ if (!err)
+ err = err2;
+ }
}
- return retval;
+ return err;
}
/*
@@ -1878,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
/*
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
- * i_sem is held, which protects generic_osync_inode() from
+ * i_mutex is held, which protects generic_osync_inode() from
* livelocking.
*/
if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2094,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
- inode_update_time(inode, 1);
+ file_update_time(file);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2181,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
BUG_ON(iocb->ki_pos != pos);
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
&iocb->ki_pos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
@@ -2206,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
struct iovec local_iov = { .iov_base = (void __user *)buf,
.iov_len = count };
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
@@ -2242,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
struct inode *inode = mapping->host;
ssize_t ret;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
int err;
@@ -2258,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
EXPORT_SYMBOL(generic_file_writev);
/*
- * Called under i_sem for writes to S_ISREG files. Returns -EIO if something
+ * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
* went wrong during pagecache shootdown.
*/
static ssize_t
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29..b960ac8e591 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
*ppos = pos;
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_sem.
+ * cannot change under us because we hold i_mutex.
*/
if (pos > inode->i_size) {
i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
loff_t pos;
ssize_t ret;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
if (!access_ok(VERIFY_READ, buf, len)) {
ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
if (ret)
goto out_backing;
- inode_update_time(inode, 1);
+ file_update_time(filp);
ret = __xip_file_write (filp, buf, count, pos, ppos);
out_backing:
current->backing_dev_info = NULL;
out_up:
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f4c43d7980b..b21d78c941b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -12,6 +12,7 @@
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
for (z = zonelist->zones; *z; z++) {
nid = (*z)->zone_pgdat->node_id;
- if (!list_empty(&hugepage_freelists[nid]))
+ if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+ !list_empty(&hugepage_freelists[nid]))
break;
}
diff --git a/mm/memory.c b/mm/memory.c
index 7197f9bcd38..7a11ddd5060 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1784,13 +1784,13 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
if (!inode->i_op || !inode->i_op->truncate_range)
return -ENOSYS;
- down(&inode->i_sem);
+ mutex_lock(&inode->i_mutex);
down_write(&inode->i_alloc_sem);
unmap_mapping_range(mapping, offset, (end - offset), 1);
truncate_inode_pages_range(mapping, offset, end);
inode->i_op->truncate_range(inode, offset, end);
up_write(&inode->i_alloc_sem);
- up(&inode->i_sem);
+ mutex_unlock(&inode->i_mutex);
return 0;
}
@@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952..73790188b0e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+/* Internal flags */
+#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
+#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
+#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
+
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;
@@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
break;
}
policy->policy = mode;
+ policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
-/* Ensure all existing pages follow the policy. */
+static void gather_stats(struct page *, void *);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
+
+/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pte_t *orig_pte;
pte_t *pte;
@@ -192,8 +208,28 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
+ if (PageReserved(page))
+ continue;
nid = page_to_nid(page);
- if (!node_isset(nid, *nodes))
+ if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ continue;
+
+ if (flags & MPOL_MF_STATS)
+ gather_stats(page, private);
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, private, flags);
+ else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
@@ -201,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pmd_t *pmd;
unsigned long next;
@@ -211,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- if (check_pte_range(vma, pmd, addr, next, nodes))
+ if (check_pte_range(vma, pmd, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pud_t *pud;
unsigned long next;
@@ -228,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (check_pmd_range(vma, pud, addr, next, nodes))
+ if (check_pmd_range(vma, pud, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pud++, addr = next, addr != end);
return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, nodemask_t *nodes)
+ unsigned long addr, unsigned long end,
+ const nodemask_t *nodes, unsigned long flags,
+ void *private)
{
pgd_t *pgd;
unsigned long next;
@@ -245,36 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (check_pud_range(vma, pgd, addr, next, nodes))
+ if (check_pud_range(vma, pgd, addr, next, nodes,
+ flags, private))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
}
-/* Step 1: check the range */
+/* Check if a vma is migratable */
+static inline int vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (
+ VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
+ return 0;
+ return 1;
+}
+
+/*
+ * Check if all pages in a range are on a set of nodes.
+ * If pagelist != NULL then isolate pages from the LRU and
+ * put them on the pagelist.
+ */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- nodemask_t *nodes, unsigned long flags)
+ const nodemask_t *nodes, unsigned long flags, void *private)
{
int err;
struct vm_area_struct *first, *vma, *prev;
+ /* Clear the LRU lists so pages can be isolated */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_add_drain_all();
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
- if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
- if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
- if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ if (!vma->vm_next && vma->vm_end < end)
+ return ERR_PTR(-EFAULT);
+ if (prev && prev->vm_end < vma->vm_start)
+ return ERR_PTR(-EFAULT);
+ }
+ if (!is_vm_hugetlb_page(vma) &&
+ ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))) {
unsigned long endvma = vma->vm_end;
+
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
- err = check_pgd_range(vma, start, endvma, nodes);
+ err = check_pgd_range(vma, start, endvma, nodes,
+ flags, private);
if (err) {
first = ERR_PTR(err);
break;
@@ -333,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
if (!nodes)
return 0;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- cpuset_restrict_to_mems_allowed(nodes->bits);
- return mpol_check_policy(mode, nodes);
-}
-
-long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask, unsigned long flags)
-{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- struct mempolicy *new;
- unsigned long end;
- int err;
-
- if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
- return -EINVAL;
- if (start & ~PAGE_MASK)
+ cpuset_update_task_memory_state();
+ if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
return -EINVAL;
- if (mode == MPOL_DEFAULT)
- flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
- end = start + len;
- if (end < start)
- return -EINVAL;
- if (end == start)
- return 0;
- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
- new = mpol_new(mode, nmask);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
-
- down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nmask, flags);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma))
- err = mbind_range(vma, start, end, new);
- up_write(&mm->mmap_sem);
- mpol_free(new);
- return err;
+ return mpol_check_policy(mode, nodes);
}
/* Set the process memory policy */
@@ -448,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
if (flags & MPOL_F_ADDR) {
@@ -500,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
/*
+ * page migration
+ */
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (isolate_lru_page(page))
+ list_add(&page->lru, pagelist);
+ }
+}
+
+static int swap_pages(struct list_head *pagelist)
+{
+ LIST_HEAD(moved);
+ LIST_HEAD(failed);
+ int n;
+
+ n = migrate_pages(pagelist, NULL, &moved, &failed);
+ putback_lru_pages(&failed);
+ putback_lru_pages(&moved);
+
+ return n;
+}
+
+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ LIST_HEAD(pagelist);
+ int count = 0;
+ nodemask_t nodes;
+
+ nodes_andnot(nodes, *from_nodes, *to_nodes);
+
+ down_read(&mm->mmap_sem);
+ check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist)) {
+ count = swap_pages(&pagelist);
+ putback_lru_pages(&pagelist);
+ }
+
+ up_read(&mm->mmap_sem);
+ return count;
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *new;
+ unsigned long end;
+ int err;
+ LIST_HEAD(pagelist);
+
+ if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ || mode > MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (mode == MPOL_DEFAULT)
+ flags &= ~MPOL_MF_STRICT;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+
+ new = mpol_new(mode, nmask);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ /*
+ * If we are using the default policy then operation
+ * on discontinuous address spaces is okay after all
+ */
+ if (!new)
+ flags |= MPOL_MF_DISCONTIG_OK;
+
+ PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode,nodes_addr(nodes)[0]);
+
+ down_write(&mm->mmap_sem);
+ vma = check_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+ err = PTR_ERR(vma);
+ if (!IS_ERR(vma)) {
+ int nr_failed = 0;
+
+ err = mbind_range(vma, start, end, new);
+ if (!list_empty(&pagelist))
+ nr_failed = swap_pages(&pagelist);
+
+ if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ }
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+
+ up_write(&mm->mmap_sem);
+ mpol_free(new);
+ return err;
+}
+
+/*
* User space interface with variable sized bitmaps for nodelists.
*/
/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
@@ -593,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
return do_set_mempolicy(mode, &nodes);
}
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+ const unsigned long __user *old_nodes,
+ const unsigned long __user *new_nodes)
+{
+ struct mm_struct *mm;
+ struct task_struct *task;
+ nodemask_t old;
+ nodemask_t new;
+ nodemask_t task_nodes;
+ int err;
+
+ err = get_nodes(&old, old_nodes, maxnode);
+ if (err)
+ return err;
+
+ err = get_nodes(&new, new_nodes, maxnode);
+ if (err)
+ return err;
+
+ /* Find the mm_struct */
+ read_lock(&tasklist_lock);
+ task = pid ? find_task_by_pid(pid) : current;
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ mm = get_task_mm(task);
+ read_unlock(&tasklist_lock);
+
+ if (!mm)
+ return -EINVAL;
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. The right exists if the process has administrative
+ * capabilities, superuser priviledges or the same
+ * userid as the target process.
+ */
+ if ((current->euid != task->suid) && (current->euid != task->uid) &&
+ (current->uid != task->suid) && (current->uid != task->uid) &&
+ !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ task_nodes = cpuset_mems_allowed(task);
+ /* Is the user allowed to access the target nodes? */
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+ mmput(mm);
+ return err;
+}
+
+
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
@@ -699,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
#endif
/* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+ struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
@@ -759,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
return nid;
}
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+ switch (policy->policy) {
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND:
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+ case MPOL_PREFERRED:
+ if (policy->v.preferred_node >= 0)
+ return policy->v.preferred_node;
+ /* Fall through */
+
+ default:
+ return numa_node_id();
+ }
+}
+
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
@@ -848,7 +1092,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -874,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
* interrupt con