Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	1238
1 file changed, 712 insertions, 526 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4a57f135b76..8f5330d74f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
  *                the allocation to memory nodes instead
  *
  * preferred       Try a specific node first before normal fallback.
- *                As a special case node -1 here means do the allocation
+ *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
@@ -65,6 +65,8 @@
     kernel is not always grateful with that.
 */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -75,7 +77,7 @@
 #include <linux/cpuset.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/nsproxy.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
@@ -90,16 +92,18 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
+#include <linux/random.h>
 
 #include "internal.h"
 
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
-#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
 
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
@@ -111,12 +115,35 @@ enum zone_type policy_zone = 0;
 /*
  * run-time system-wide default policy => local allocation
  */
-struct mempolicy default_policy = {
+static struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
 	.mode = MPOL_PREFERRED,
 	.flags = MPOL_F_LOCAL,
 };
 
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	struct mempolicy *pol = p->mempolicy;
+
+	if (!pol) {
+		int node = numa_node_id();
+
+		if (node != NUMA_NO_NODE) {
+			pol = &preferred_node_policy[node];
+			/*
+			 * preferred_node_policy is not initialised early in
+			 * boot
+			 */
+			if (!pol->mode)
+				pol = NULL;
+		}
+	}
+
+	return pol;
+}
+
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 	/*
@@ -140,19 +167,7 @@ static const struct mempolicy_operations {
 /* Check that the nodemask contains at least one populated zone */
 static int is_valid_nodemask(const nodemask_t *nodemask)
 {
-	int nd, k;
-
-	for_each_node_mask(nd, *nodemask) {
-		struct zone *z;
-
-		for (k = 0; k <= policy_zone; k++) {
-			z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				return 1;
-		}
-	}
-
-	return 0;
+	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 }
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -212,9 +227,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right.
*/  	if (pol == NULL)  		return 0; -	/* Check N_HIGH_MEMORY */ +	/* Check N_MEMORY */  	nodes_and(nsc->mask1, -		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); +		  cpuset_current_mems_allowed, node_states[N_MEMORY]);  	VM_BUG_ON(!nodes);  	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) @@ -249,12 +264,12 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  	struct mempolicy *policy;  	pr_debug("setting mode %d flags %d nodes[0] %lx\n", -		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); +		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);  	if (mode == MPOL_DEFAULT) {  		if (nodes && !nodes_empty(*nodes))  			return ERR_PTR(-EINVAL); -		return NULL;	/* simply delete any existing policy */ +		return NULL;  	}  	VM_BUG_ON(!nodes); @@ -269,6 +284,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,  			     (flags & MPOL_F_RELATIVE_NODES)))  				return ERR_PTR(-EINVAL);  		} +	} else if (mode == MPOL_LOCAL) { +		if (!nodes_empty(*nodes)) +			return ERR_PTR(-EINVAL); +		mode = MPOL_PREFERRED;  	} else if (nodes_empty(*nodes))  		return ERR_PTR(-EINVAL);  	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -390,7 +409,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,  {  	if (!pol)  		return; -	if (!mpol_store_user_nodemask(pol) && step == 0 && +	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&  	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))  		return; @@ -457,12 +476,14 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {  	},  }; -static void gather_stats(struct page *, void *, int pte_dirty);  static void migrate_page_add(struct page *page, struct list_head *pagelist,  				unsigned long flags); -/* Scan through pages checking if pages follow certain conditions. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  		unsigned long addr, unsigned long end,  		const nodemask_t *nodes, unsigned long flags,  		void *private) @@ -484,17 +505,14 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  		/*  		 * vm_normal_page() filters out zero pages, but there might  		 * still be PageReserved pages to skip, perhaps in a VDSO. -		 * And we cannot move PageKsm pages sensibly or safely yet.  		 
*/ -		if (PageReserved(page) || PageKsm(page)) +		if (PageReserved(page))  			continue;  		nid = page_to_nid(page);  		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))  			continue; -		if (flags & MPOL_MF_STATS) -			gather_stats(page, private, pte_dirty(*pte)); -		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) +		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))  			migrate_page_add(page, private, flags);  		else  			break; @@ -503,7 +521,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,  	return addr != end;  } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, +		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, +				    void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE +	int nid; +	struct page *page; +	spinlock_t *ptl; +	pte_t entry; + +	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); +	entry = huge_ptep_get((pte_t *)pmd); +	if (!pte_present(entry)) +		goto unlock; +	page = pte_page(entry); +	nid = page_to_nid(page); +	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) +		goto unlock; +	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ +	if (flags & (MPOL_MF_MOVE_ALL) || +	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) +		isolate_huge_page(page, private); +unlock: +	spin_unlock(ptl); +#else +	BUG(); +#endif +} + +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,  		unsigned long addr, unsigned long end,  		const nodemask_t *nodes, unsigned long flags,  		void *private) @@ -514,16 +561,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,  	pmd = pmd_offset(pud, addr);  	do {  		next = pmd_addr_end(addr, end); -		if (pmd_none_or_clear_bad(pmd)) +		if (!pmd_present(*pmd))  			continue; -		if (check_pte_range(vma, pmd, addr, next, nodes, +		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { +			queue_pages_hugetlb_pmd_range(vma, pmd, nodes, +						flags, private); +			continue; +		} +		split_huge_page_pmd(vma, addr, pmd); +		if (pmd_none_or_trans_huge_or_clear_bad(pmd)) +			continue; +		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,  				    flags, private))  			return -EIO;  	} while (pmd++, addr = next, addr != end);  	return 0;  } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,  		unsigned long addr, unsigned long end,  		const nodemask_t *nodes, unsigned long flags,  		void *private) @@ -534,16 +589,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,  	pud = pud_offset(pgd, addr);  	do {  		next = pud_addr_end(addr, end); +		if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) +			continue;  		if (pud_none_or_clear_bad(pud))  			continue; -		if (check_pmd_range(vma, pud, addr, next, nodes, +		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,  				    flags, private))  			return -EIO;  	} while (pud++, addr = next, addr != end);  	return 0;  } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma,  		unsigned long addr, unsigned long end,  		const nodemask_t *nodes, unsigned long flags,  		void *private) @@ -556,77 +613,128 @@ static inline int check_pgd_range(struct vm_area_struct *vma,  		next = pgd_addr_end(addr, end);  		if (pgd_none_or_clear_bad(pgd))  			continue; -		if (check_pud_range(vma, pgd, addr, next, nodes, +		if 
(queue_pages_pud_range(vma, pgd, addr, next, nodes,  				    flags, private))  			return -EIO;  	} while (pgd++, addr = next, addr != end);  	return 0;  } +#ifdef CONFIG_NUMA_BALANCING  /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core.   */ -static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, -		const nodemask_t *nodes, unsigned long flags, void *private) +unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end)  { -	int err; -	struct vm_area_struct *first, *vma, *prev; +	int nr_updated; + +	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); +	if (nr_updated) +		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); +	return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, +			unsigned long addr, unsigned long end) +{ +	return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ -	first = find_vma(mm, start); -	if (!first) -		return ERR_PTR(-EFAULT); +/* + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) + */ +static int +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, +		const nodemask_t *nodes, unsigned long flags, void *private) +{ +	int err = 0; +	struct vm_area_struct *vma, *prev; + +	vma = find_vma(mm, start); +	if (!vma) +		return -EFAULT;  	prev = NULL; -	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { +	for (; vma && vma->vm_start < end; vma = vma->vm_next) { +		unsigned long endvma = vma->vm_end; + +		if (endvma > end) +			endvma = end; +		if (vma->vm_start > start) +			start = vma->vm_start; +  		if (!(flags & MPOL_MF_DISCONTIG_OK)) {  			if (!vma->vm_next && vma->vm_end < end) -				return ERR_PTR(-EFAULT); +				return -EFAULT;  			if (prev && prev->vm_end < vma->vm_start) -				return ERR_PTR(-EFAULT); +				return -EFAULT; +		} + +		if (flags & MPOL_MF_LAZY) { +			change_prot_numa(vma, start, endvma); +			goto next;  		} -		if (!is_vm_hugetlb_page(vma) && -		    ((flags & MPOL_MF_STRICT) || + +		if ((flags & MPOL_MF_STRICT) ||  		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && -				vma_migratable(vma)))) { -			unsigned long endvma = vma->vm_end; - -			if (endvma > end) -				endvma = end; -			if (vma->vm_start > start) -				start = vma->vm_start; -			err = check_pgd_range(vma, start, endvma, nodes, +		      vma_migratable(vma))) { + +			err = queue_pages_pgd_range(vma, start, endvma, nodes,  						flags, private); -			if (err) { -				first = ERR_PTR(err); +			if (err)  				break; -			}  		} +next:  		prev = vma;  	} -	return first; +	return err;  } -/* Apply policy to a single VMA */ -static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. 
+ */ +static int vma_replace_policy(struct vm_area_struct *vma, +						struct mempolicy *pol)  { -	int err = 0; -	struct mempolicy *old = vma->vm_policy; +	int err; +	struct mempolicy *old; +	struct mempolicy *new;  	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",  		 vma->vm_start, vma->vm_end, vma->vm_pgoff,  		 vma->vm_ops, vma->vm_file,  		 vma->vm_ops ? vma->vm_ops->set_policy : NULL); -	if (vma->vm_ops && vma->vm_ops->set_policy) +	new = mpol_dup(pol); +	if (IS_ERR(new)) +		return PTR_ERR(new); + +	if (vma->vm_ops && vma->vm_ops->set_policy) {  		err = vma->vm_ops->set_policy(vma, new); -	if (!err) { -		mpol_get(new); -		vma->vm_policy = new; -		mpol_put(old); +		if (err) +			goto err_out;  	} + +	old = vma->vm_policy; +	vma->vm_policy = new; /* protected by mmap_sem */ +	mpol_put(old); + +	return 0; + err_out: +	mpol_put(new);  	return err;  } @@ -642,22 +750,34 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,  	unsigned long vmstart;  	unsigned long vmend; -	vma = find_vma_prev(mm, start, &prev); +	vma = find_vma(mm, start);  	if (!vma || vma->vm_start > start)  		return -EFAULT; +	prev = vma->vm_prev; +	if (start > vma->vm_start) +		prev = vma; +  	for (; vma && vma->vm_start < end; prev = vma, vma = next) {  		next = vma->vm_next;  		vmstart = max(start, vma->vm_start);  		vmend   = min(end, vma->vm_end); -		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); +		if (mpol_equal(vma_policy(vma), new_pol)) +			continue; + +		pgoff = vma->vm_pgoff + +			((vmstart - vma->vm_start) >> PAGE_SHIFT);  		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, -				  vma->anon_vma, vma->vm_file, pgoff, new_pol); +				  vma->anon_vma, vma->vm_file, pgoff, +				  new_pol);  		if (prev) {  			vma = prev;  			next = vma->vm_next; -			continue; +			if (mpol_equal(vma_policy(vma), new_pol)) +				continue; +			/* vma_merge() joined vma && vma->next, case 8 */ +			goto replace;  		}  		if (vma->vm_start != vmstart) {  			err = split_vma(vma->vm_mm, vma, vmstart, 1); @@ -669,7 +789,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,  			if (err)  				goto out;  		} -		err = policy_vma(vma, new_pol); + replace: +		err = vma_replace_policy(vma, new_pol);  		if (err)  			goto out;  	} @@ -678,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,  	return err;  } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy.  Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. 
- */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ -	if (p->mempolicy) -		p->flags |= PF_MEMPOLICY; -	else -		p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ -	mpol_fix_fork_child_flag(current); -} -  /* Set the process memory policy */  static long do_set_mempolicy(unsigned short mode, unsigned short flags,  			     nodemask_t *nodes) @@ -744,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,  	}  	old = current->mempolicy;  	current->mempolicy = new; -	mpol_set_task_struct_flag();  	if (new && new->mode == MPOL_INTERLEAVE &&  	    nodes_weight(new->v.nodes))  		current->il_next = first_node(new->v.nodes); @@ -911,7 +1001,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,  static struct page *new_node_page(struct page *page, unsigned long node, int **x)  { -	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); +	if (PageHuge(page)) +		return alloc_huge_page_node(page_hstate(compound_head(page)), +					node); +	else +		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);  }  /* @@ -924,20 +1018,24 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,  	nodemask_t nmask;  	LIST_HEAD(pagelist);  	int err = 0; -	struct vm_area_struct *vma;  	nodes_clear(nmask);  	node_set(source, nmask); -	vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, +	/* +	 * This does not "check" the range but isolates all pages that +	 * need migration.  Between passing in the full user address +	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. +	 */ +	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); +	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,  			flags | MPOL_MF_DISCONTIG_OK, &pagelist); -	if (IS_ERR(vma)) -		return PTR_ERR(vma);  	if (!list_empty(&pagelist)) { -		err = migrate_pages(&pagelist, new_node_page, dest, 0); +		err = migrate_pages(&pagelist, new_node_page, NULL, dest, +					MIGRATE_SYNC, MR_SYSCALL);  		if (err) -			putback_lru_pages(&pagelist); +			putback_movable_pages(&pagelist);  	}  	return err; @@ -949,8 +1047,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,   *   * Returns the number of page that could not be moved.   */ -int do_migrate_pages(struct mm_struct *mm, -	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, +		     const nodemask_t *to, int flags)  {  	int busy = 0;  	int err; @@ -962,7 +1060,7 @@ int do_migrate_pages(struct mm_struct *mm,  	down_read(&mm->mmap_sem); -	err = migrate_vmas(mm, from_nodes, to_nodes, flags); +	err = migrate_vmas(mm, from, to, flags);  	if (err)  		goto out; @@ -991,20 +1089,40 @@ int do_migrate_pages(struct mm_struct *mm,  	 * most recent <s, d> pair that moved (s != d).  If we find a pair  	 * that not only moved, but what's better, moved to an empty slot  	 * (d is not set in tmp), then we break out then, with that pair. -	 * Otherwise when we finish scannng from_tmp, we at least have the +	 * Otherwise when we finish scanning from_tmp, we at least have the  	 * most recent <s, d> pair that moved.  If we get all the way through  	 * the scan of tmp without finding any node that moved, much less  	 * moved to an empty node, then there is nothing left worth migrating.  	 
*/ -	tmp = *from_nodes; +	tmp = *from;  	while (!nodes_empty(tmp)) {  		int s,d; -		int source = -1; +		int source = NUMA_NO_NODE;  		int dest = 0;  		for_each_node_mask(s, tmp) { -			d = node_remap(s, *from_nodes, *to_nodes); + +			/* +			 * do_migrate_pages() tries to maintain the relative +			 * node relationship of the pages established between +			 * threads and memory areas. +                         * +			 * However if the number of source nodes is not equal to +			 * the number of destination nodes we can not preserve +			 * this node relative relationship.  In that case, skip +			 * copying memory from a node that is in the destination +			 * mask. +			 * +			 * Example: [2,3,4] -> [3,4,5] moves everything. +			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7. +			 */ + +			if ((nodes_weight(*from) != nodes_weight(*to)) && +						(node_isset(s, *to))) +				continue; + +			d = node_remap(s, *from, *to);  			if (s == d)  				continue; @@ -1015,7 +1133,7 @@ int do_migrate_pages(struct mm_struct *mm,  			if (!node_isset(dest, tmp))  				break;  		} -		if (source == -1) +		if (source == NUMA_NO_NODE)  			break;  		node_clear(source, tmp); @@ -1035,16 +1153,17 @@ out:  /*   * Allocate a new page for page migration based on vma policy. - * Start assuming that page is mapped by vma pointed to by @private. + * Start by assuming the page is mapped by the same vma as contains @start.   * Search forward from there, if not.  N.B., this assumes that the   * list of pages handed to migrate_pages()--which is how we get here--   * is in virtual address order.   */ -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x)  { -	struct vm_area_struct *vma = (struct vm_area_struct *)private; +	struct vm_area_struct *vma;  	unsigned long uninitialized_var(address); +	vma = find_vma(current->mm, start);  	while (vma) {  		address = page_address_in_vma(page, vma);  		if (address != -EFAULT) @@ -1052,6 +1171,10 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *  		vma = vma->vm_next;  	} +	if (PageHuge(page)) { +		BUG_ON(!vma); +		return alloc_huge_page_noerr(vma, address, 1); +	}  	/*  	 * if !vma, alloc_page_vma() will use task or system default policy  	 */ @@ -1064,13 +1187,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,  {  } -int do_migrate_pages(struct mm_struct *mm, -	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, +		     const nodemask_t *to, int flags)  {  	return -ENOSYS;  } -static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +static struct page *new_page(struct page *page, unsigned long start, int **x)  {  	return NULL;  } @@ -1080,15 +1203,13 @@ static long do_mbind(unsigned long start, unsigned long len,  		     unsigned short mode, unsigned short mode_flags,  		     nodemask_t *nmask, unsigned long flags)  { -	struct vm_area_struct *vma;  	struct mm_struct *mm = current->mm;  	struct mempolicy *new;  	unsigned long end;  	int err;  	LIST_HEAD(pagelist); -	if (flags & ~(unsigned long)(MPOL_MF_STRICT | -				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) +	if (flags & ~(unsigned long)MPOL_MF_VALID)  		return -EINVAL;  	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))  		return -EPERM; @@ -1111,6 +1232,9 @@ static long do_mbind(unsigned long start, unsigned long len,  	if (IS_ERR(new))  		return 
PTR_ERR(new); +	if (flags & MPOL_MF_LAZY) +		new->flags |= MPOL_F_MOF; +  	/*  	 * If we are using the default policy then operation  	 * on discontinuous address spaces is okay after all @@ -1120,7 +1244,7 @@ static long do_mbind(unsigned long start, unsigned long len,  	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",  		 start, start + len, mode, mode_flags, -		 nmask ? nodes_addr(*nmask)[0] : -1); +		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);  	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { @@ -1144,26 +1268,26 @@ static long do_mbind(unsigned long start, unsigned long len,  	if (err)  		goto mpol_out; -	vma = check_range(mm, start, end, nmask, +	err = queue_pages_range(mm, start, end, nmask,  			  flags | MPOL_MF_INVERT, &pagelist); +	if (!err) +		err = mbind_range(mm, start, end, new); -	err = PTR_ERR(vma); -	if (!IS_ERR(vma)) { +	if (!err) {  		int nr_failed = 0; -		err = mbind_range(mm, start, end, new); -  		if (!list_empty(&pagelist)) { -			nr_failed = migrate_pages(&pagelist, new_vma_page, -						(unsigned long)vma, 0); +			WARN_ON_ONCE(flags & MPOL_MF_LAZY); +			nr_failed = migrate_pages(&pagelist, new_page, NULL, +				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);  			if (nr_failed) -				putback_lru_pages(&pagelist); +				putback_movable_pages(&pagelist);  		} -		if (!err && nr_failed && (flags & MPOL_MF_STRICT)) +		if (nr_failed && (flags & MPOL_MF_STRICT))  			err = -EIO;  	} else -		putback_lru_pages(&pagelist); +		putback_movable_pages(&pagelist);  	up_write(&mm->mmap_sem);   mpol_out: @@ -1239,7 +1363,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,  }  SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, -		unsigned long, mode, unsigned long __user *, nmask, +		unsigned long, mode, const unsigned long __user *, nmask,  		unsigned long, maxnode, unsigned, flags)  {  	nodemask_t nodes; @@ -1260,7 +1384,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,  }  /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,  		unsigned long, maxnode)  {  	int err; @@ -1307,19 +1431,16 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,  		goto out;  	/* Find the mm_struct */ -	read_lock(&tasklist_lock); +	rcu_read_lock();  	task = pid ? find_task_by_vpid(pid) : current;  	if (!task) { -		read_unlock(&tasklist_lock); +		rcu_read_unlock();  		err = -ESRCH;  		goto out;  	} -	mm = get_task_mm(task); -	read_unlock(&tasklist_lock); +	get_task_struct(task);  	err = -EINVAL; -	if (!mm) -		goto out;  	/*  	 * Check if this process has the right to modify the specified @@ -1327,14 +1448,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,  	 * capabilities, superuser privileges or the same  	 * userid as the target process.  	 */ -	rcu_read_lock();  	tcred = __task_cred(task); -	if (cred->euid != tcred->suid && cred->euid != tcred->uid && -	    cred->uid  != tcred->suid && cred->uid  != tcred->uid && +	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && +	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&  	    !capable(CAP_SYS_NICE)) {  		rcu_read_unlock();  		err = -EPERM; -		goto out; +		goto out_put;  	}  	rcu_read_unlock(); @@ -1342,26 +1462,39 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,  	/* Is the user allowed to access the target nodes? 
*/  	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {  		err = -EPERM; -		goto out; +		goto out_put;  	} -	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { +	if (!nodes_subset(*new, node_states[N_MEMORY])) {  		err = -EINVAL; -		goto out; +		goto out_put;  	}  	err = security_task_movememory(task);  	if (err) +		goto out_put; + +	mm = get_task_mm(task); +	put_task_struct(task); + +	if (!mm) { +		err = -EINVAL;  		goto out; +	}  	err = do_migrate_pages(mm, old, new,  		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); + +	mmput(mm);  out: -	if (mm) -		mmput(mm);  	NODEMASK_SCRATCH_FREE(scratch);  	return err; + +out_put: +	put_task_struct(task); +	goto out; +  } @@ -1393,10 +1526,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,  #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, -				     compat_ulong_t __user *nmask, -				     compat_ulong_t maxnode, -				     compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, +		       compat_ulong_t __user *, nmask, +		       compat_ulong_t, maxnode, +		       compat_ulong_t, addr, compat_ulong_t, flags)  {  	long err;  	unsigned long __user *nm = NULL; @@ -1412,7 +1545,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,  	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);  	if (!err && nmask) { -		err = copy_from_user(bm, nm, alloc_size); +		unsigned long copy_size; +		copy_size = min_t(unsigned long, sizeof(bm), alloc_size); +		err = copy_from_user(bm, nm, copy_size);  		/* ensure entire bitmap is zeroed */  		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);  		err |= compat_put_bitmap(nmask, bm, nr_bits); @@ -1421,8 +1556,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,  	return err;  } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, -				     compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, +		       compat_ulong_t, maxnode)  {  	long err = 0;  	unsigned long __user *nm = NULL; @@ -1444,9 +1579,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,  	return sys_set_mempolicy(mode, nm, nr_bits+1);  } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, -			     compat_ulong_t mode, compat_ulong_t __user *nmask, -			     compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, +		       compat_ulong_t, mode, compat_ulong_t __user *, nmask, +		       compat_ulong_t, maxnode, compat_ulong_t, flags)  {  	long err = 0;  	unsigned long __user *nm = NULL; @@ -1472,24 +1607,23 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,  /*   * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma   - virtual memory area whose policy is sought - * @addr  - address in @vma for shared policy lookup + * @task: task for fallback if vma policy == default + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup   *   * Returns effective policy for a VMA at specified address.   * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies - * are protected by the task's mmap_sem, which must be held for read by - * the caller. 
+ * Current or other task's task mempolicy and non-shared vma policies must be + * protected by task_lock(task) by the caller.   * Shared policies [those marked as MPOL_F_SHARED] require an extra reference   * count--added by the get_policy() vm_op, as appropriate--to protect against   * freeing by another task.  It is the caller's responsibility to free the   * extra reference for shared policies.   */ -static struct mempolicy *get_vma_policy(struct task_struct *task, +struct mempolicy *get_vma_policy(struct task_struct *task,  		struct vm_area_struct *vma, unsigned long addr)  { -	struct mempolicy *pol = task->mempolicy; +	struct mempolicy *pol = get_task_policy(task);  	if (vma) {  		if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1497,14 +1631,68 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,  									addr);  			if (vpol)  				pol = vpol; -		} else if (vma->vm_policy) +		} else if (vma->vm_policy) {  			pol = vma->vm_policy; + +			/* +			 * shmem_alloc_page() passes MPOL_F_SHARED policy with +			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference +			 * count on these policies which will be dropped by +			 * mpol_cond_put() later +			 */ +			if (mpol_needs_cond_ref(pol)) +				mpol_get(pol); +		}  	}  	if (!pol)  		pol = &default_policy;  	return pol;  } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ +	struct mempolicy *pol = get_task_policy(task); +	if (vma) { +		if (vma->vm_ops && vma->vm_ops->get_policy) { +			bool ret = false; + +			pol = vma->vm_ops->get_policy(vma, vma->vm_start); +			if (pol && (pol->flags & MPOL_F_MOF)) +				ret = true; +			mpol_cond_put(pol); + +			return ret; +		} else if (vma->vm_policy) { +			pol = vma->vm_policy; +		} +	} + +	if (!pol) +		return default_policy.flags & MPOL_F_MOF; + +	return pol->flags & MPOL_F_MOF; +} + +static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +{ +	enum zone_type dynamic_policy_zone = policy_zone; + +	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); + +	/* +	 * if policy->v.nodes has movable memory only, +	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. +	 * +	 * policy->v.nodes is intersect with node_states[N_MEMORY]. +	 * so if the following test faile, it implies +	 * policy->v.nodes has movable memory only. 
+	 */ +	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) +		dynamic_policy_zone = ZONE_MOVABLE; + +	return zone >= dynamic_policy_zone; +} +  /*   * Return a nodemask representing a mempolicy for filtering nodes for   * page allocation @@ -1513,7 +1701,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)  {  	/* Lower zones don't get a nodemask applied for MPOL_BIND */  	if (unlikely(policy->mode == MPOL_BIND) && -			gfp_zone(gfp) >= policy_zone && +			apply_policy_zone(policy, gfp_zone(gfp)) &&  			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))  		return &policy->v.nodes; @@ -1521,10 +1709,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)  }  /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, +	int nd)  { -	int nd = numa_node_id(); -  	switch (policy->mode) {  	case MPOL_PREFERRED:  		if (!(policy->flags & MPOL_F_LOCAL)) @@ -1565,15 +1752,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)  /*   * Depending on the memory policy provide a node from which to allocate the   * next slab entry. - * @policy must be protected by freeing by the caller.  If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy.  The system default policy requires no - * such protection.   */ -unsigned slab_node(struct mempolicy *policy) +unsigned int mempolicy_slab_node(void)  { +	struct mempolicy *policy; +	int node = numa_mem_id(); + +	if (in_interrupt()) +		return node; + +	policy = current->mempolicy;  	if (!policy || policy->flags & MPOL_F_LOCAL) -		return numa_node_id(); +		return node;  	switch (policy->mode) {  	case MPOL_PREFERRED: @@ -1593,11 +1783,11 @@ unsigned slab_node(struct mempolicy *policy)  		struct zonelist *zonelist;  		struct zone *zone;  		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); -		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; +		zonelist = &NODE_DATA(node)->node_zonelists[0];  		(void)first_zones_zonelist(zonelist, highest_zoneidx,  							&policy->v.nodes,  							&zone); -		return zone ? zone->node : numa_node_id(); +		return zone ? zone->node : node;  	}  	default: @@ -1612,7 +1802,7 @@ static unsigned offset_il_node(struct mempolicy *pol,  	unsigned nnodes = nodes_weight(pol->v.nodes);  	unsigned target;  	int c; -	int nid = -1; +	int nid = NUMA_NO_NODE;  	if (!nnodes)  		return numa_node_id(); @@ -1647,21 +1837,36 @@ static inline unsigned interleave_nid(struct mempolicy *pol,  		return interleave_nodes(pol);  } +/* + * Return the bit number of a random bit set in the nodemask. 
+ * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ +	int w, bit = NUMA_NO_NODE; + +	w = nodes_weight(*maskp); +	if (w) +		bit = bitmap_ord_to_pos(maskp->bits, +			get_random_int() % w, MAX_NUMNODES); +	return bit; +} +  #ifdef CONFIG_HUGETLBFS  /*   * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask   *   * Returns a zonelist suitable for a huge page allocation and a pointer   * to the struct mempolicy for conditional unref after allocation.   * If the effective policy is 'BIND, returns a pointer to the mempolicy's   * @nodemask for filtering the zonelist.   * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin()   */  struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,  				gfp_t gfp_flags, struct mempolicy **mpol, @@ -1676,7 +1881,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,  		zl = node_zonelist(interleave_nid(*mpol, vma, addr,  				huge_page_shift(hstate_vma(vma))), gfp_flags);  	} else { -		zl = policy_zonelist(gfp_flags, *mpol); +		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());  		if ((*mpol)->mode == MPOL_BIND)  			*nodemask = &(*mpol)->v.nodes;  	} @@ -1793,7 +1998,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,  }  /** - * 	alloc_page_vma	- Allocate a page for a VMA. + * 	alloc_pages_vma	- Allocate a page for a VMA.   *   * 	@gfp:   *      %GFP_USER    user allocation. @@ -1802,6 +2007,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,   *      %GFP_FS      allocation should not call back into a file system.   *      %GFP_ATOMIC  don't sleep.   * + *	@order:Order of the GFP allocation.   * 	@vma:  Pointer to VMA or NULL if not available.   *	@addr: Virtual Address of the allocation. Must be inside the VMA.   * @@ -1815,38 +2021,35 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,   *	Should be called with the mm_sem of the vma hold.   
*/  struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, +		unsigned long addr, int node)  { -	struct mempolicy *pol = get_vma_policy(current, vma, addr); -	struct zonelist *zl; +	struct mempolicy *pol;  	struct page *page; +	unsigned int cpuset_mems_cookie; + +retry_cpuset: +	pol = get_vma_policy(current, vma, addr); +	cpuset_mems_cookie = read_mems_allowed_begin(); -	get_mems_allowed();  	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {  		unsigned nid; -		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); +		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);  		mpol_cond_put(pol); -		page = alloc_page_interleave(gfp, 0, nid); -		put_mems_allowed(); +		page = alloc_page_interleave(gfp, order, nid); +		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) +			goto retry_cpuset; +  		return page;  	} -	zl = policy_zonelist(gfp, pol); -	if (unlikely(mpol_needs_cond_ref(pol))) { -		/* -		 * slow path: ref counted shared policy -		 */ -		struct page *page =  __alloc_pages_nodemask(gfp, 0, -						zl, policy_nodemask(gfp, pol)); +	page = __alloc_pages_nodemask(gfp, order, +				      policy_zonelist(gfp, pol, node), +				      policy_nodemask(gfp, pol)); +	if (unlikely(mpol_needs_cond_ref(pol)))  		__mpol_put(pol); -		put_mems_allowed(); -		return page; -	} -	/* -	 * fast path:  default or task policy -	 */ -	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); -	put_mems_allowed(); +	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) +		goto retry_cpuset;  	return page;  } @@ -1871,13 +2074,16 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)   */  struct page *alloc_pages_current(gfp_t gfp, unsigned order)  { -	struct mempolicy *pol = current->mempolicy; +	struct mempolicy *pol = get_task_policy(current);  	struct page *page; +	unsigned int cpuset_mems_cookie;  	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))  		pol = &default_policy; -	get_mems_allowed(); +retry_cpuset: +	cpuset_mems_cookie = read_mems_allowed_begin(); +  	/*  	 * No reference counting needed for current->mempolicy  	 * nor system default_policy @@ -1886,12 +2092,26 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)  		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));  	else  		page = __alloc_pages_nodemask(gfp, order, -			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); -	put_mems_allowed(); +				policy_zonelist(gfp, pol, numa_node_id()), +				policy_nodemask(gfp, pol)); + +	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) +		goto retry_cpuset; +  	return page;  }  EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ +	struct mempolicy *pol = mpol_dup(vma_policy(src)); + +	if (IS_ERR(pol)) +		return PTR_ERR(pol); +	dst->vm_policy = pol; +	return 0; +} +  /*   * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it   * rebinds the mempolicy its copying by calling mpol_rebind_policy() @@ -1919,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)  	} else  		*new = *old; -	rcu_read_lock();  	if (current_cpuset_is_being_rebound()) {  		nodemask_t mems = cpuset_mems_allowed(current);  		if (new->flags & MPOL_F_REBINDING) @@ -1927,57 +2146,33 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)  		else  			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);  	} -	rcu_read_unlock();  	atomic_set(&new->refcnt, 
1);  	return new;  } -/* - * If *frompol needs [has] an extra ref, copy *frompol to *tompol , - * eliminate the * MPOL_F_* flags that require conditional ref and - * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly - * after return.  Use the returned value. - * - * Allows use of a mempolicy for, e.g., multiple allocations with a single - * policy lookup, even if the policy needs/has extra ref on lookup. - * shmem_readahead needs this. - */ -struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, -						struct mempolicy *frompol) -{ -	if (!mpol_needs_cond_ref(frompol)) -		return frompol; - -	*tompol = *frompol; -	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */ -	__mpol_put(frompol); -	return tompol; -} -  /* Slow path of a mempolicy comparison */ -int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)  {  	if (!a || !b) -		return 0; +		return false;  	if (a->mode != b->mode) -		return 0; +		return false;  	if (a->flags != b->flags) -		return 0; +		return false;  	if (mpol_store_user_nodemask(a))  		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) -			return 0; +			return false;  	switch (a->mode) {  	case MPOL_BIND:  		/* Fall through */  	case MPOL_INTERLEAVE: -		return nodes_equal(a->v.nodes, b->v.nodes); +		return !!nodes_equal(a->v.nodes, b->v.nodes);  	case MPOL_PREFERRED: -		return a->v.preferred_node == b->v.preferred_node && -			a->flags == b->flags; +		return a->v.preferred_node == b->v.preferred_node;  	default:  		BUG(); -		return 0; +		return false;  	}  } @@ -2065,26 +2260,132 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)  	return pol;  } +static void sp_free(struct sp_node *n) +{ +	mpol_put(n->policy); +	kmem_cache_free(sn_cache, n); +} + +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + *	-1	- not misplaced, page is in the right node + *	node	- node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ +	struct mempolicy *pol; +	struct zone *zone; +	int curnid = page_to_nid(page); +	unsigned long pgoff; +	int thiscpu = raw_smp_processor_id(); +	int thisnid = cpu_to_node(thiscpu); +	int polnid = -1; +	int ret = -1; + +	BUG_ON(!vma); + +	pol = get_vma_policy(current, vma, addr); +	if (!(pol->flags & MPOL_F_MOF)) +		goto out; + +	switch (pol->mode) { +	case MPOL_INTERLEAVE: +		BUG_ON(addr >= vma->vm_end); +		BUG_ON(addr < vma->vm_start); + +		pgoff = vma->vm_pgoff; +		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; +		polnid = offset_il_node(pol, vma, pgoff); +		break; + +	case MPOL_PREFERRED: +		if (pol->flags & MPOL_F_LOCAL) +			polnid = numa_node_id(); +		else +			polnid = pol->v.preferred_node; +		break; + +	case MPOL_BIND: +		/* +		 * allows binding to multiple nodes. +		 * use current page if in policy nodemask, +		 * else select nearest allowed node, if any. +		 * If no allowed nodes, use current [!misplaced]. 
+		 */ +		if (node_isset(curnid, pol->v.nodes)) +			goto out; +		(void)first_zones_zonelist( +				node_zonelist(numa_node_id(), GFP_HIGHUSER), +				gfp_zone(GFP_HIGHUSER), +				&pol->v.nodes, &zone); +		polnid = zone->node; +		break; + +	default: +		BUG(); +	} + +	/* Migrate the page towards the node whose CPU is referencing it */ +	if (pol->flags & MPOL_F_MORON) { +		polnid = thisnid; + +		if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) +			goto out; +	} + +	if (curnid != polnid) +		ret = polnid; +out: +	mpol_cond_put(pol); + +	return ret; +} +  static void sp_delete(struct shared_policy *sp, struct sp_node *n)  {  	pr_debug("deleting %lx-l%lx\n", n->start, n->end);  	rb_erase(&n->nd, &sp->root); -	mpol_put(n->policy); -	kmem_cache_free(sn_cache, n); +	sp_free(n); +} + +static void sp_node_init(struct sp_node *node, unsigned long start, +			unsigned long end, struct mempolicy *pol) +{ +	node->start = start; +	node->end = end; +	node->policy = pol;  }  static struct sp_node *sp_alloc(unsigned long start, unsigned long end,  				struct mempolicy *pol)  { -	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); +	struct sp_node *n; +	struct mempolicy *newpol; +	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);  	if (!n)  		return NULL; -	n->start = start; -	n->end = end; -	mpol_get(pol); -	pol->flags |= MPOL_F_SHARED;	/* for unref */ -	n->policy = pol; + +	newpol = mpol_dup(pol); +	if (IS_ERR(newpol)) { +		kmem_cache_free(sn_cache, n); +		return NULL; +	} +	newpol->flags |= MPOL_F_SHARED; +	sp_node_init(n, start, end, newpol); +  	return n;  } @@ -2092,7 +2393,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,  static int shared_policy_replace(struct shared_policy *sp, unsigned long start,  				 unsigned long end, struct sp_node *new)  { -	struct sp_node *n, *new2 = NULL; +	struct sp_node *n; +	struct sp_node *n_new = NULL; +	struct mempolicy *mpol_new = NULL; +	int ret = 0;  restart:  	spin_lock(&sp->lock); @@ -2108,16 +2412,16 @@ restart:  		} else {  			/* Old policy spanning whole new range. */  			if (n->end > end) { -				if (!new2) { -					spin_unlock(&sp->lock); -					new2 = sp_alloc(end, n->end, n->policy); -					if (!new2) -						return -ENOMEM; -					goto restart; -				} +				if (!n_new) +					goto alloc_new; + +				*mpol_new = *n->policy; +				atomic_set(&mpol_new->refcnt, 1); +				sp_node_init(n_new, end, n->end, mpol_new);  				n->end = start; -				sp_insert(sp, new2); -				new2 = NULL; +				sp_insert(sp, n_new); +				n_new = NULL; +				mpol_new = NULL;  				break;  			} else  				n->end = start; @@ -2129,11 +2433,26 @@ restart:  	if (new)  		sp_insert(sp, new);  	spin_unlock(&sp->lock); -	if (new2) { -		mpol_put(new2->policy); -		kmem_cache_free(sn_cache, new2); -	} -	return 0; +	ret = 0; + +err_out: +	if (mpol_new) +		mpol_put(mpol_new); +	if (n_new) +		kmem_cache_free(sn_cache, n_new); + +	return ret; + +alloc_new: +	spin_unlock(&sp->lock); +	ret = -ENOMEM; +	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); +	if (!n_new) +		goto err_out; +	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); +	if (!mpol_new) +		goto err_out; +	goto restart;  }  /** @@ -2196,7 +2515,7 @@ int mpol_set_shared_policy(struct shared_policy *info,  		 vma->vm_pgoff,  		 sz, npol ? npol->mode : -1,  		 npol ? npol->flags : -1, -		 npol ? nodes_addr(npol->v.nodes)[0] : -1); +		 npol ? 
nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);  	if (npol) {  		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -2205,7 +2524,7 @@ int mpol_set_shared_policy(struct shared_policy *info,  	}  	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);  	if (err && new) -		kmem_cache_free(sn_cache, new); +		sp_free(new);  	return err;  } @@ -2222,13 +2541,60 @@ void mpol_free_shared_policy(struct shared_policy *p)  	while (next) {  		n = rb_entry(next, struct sp_node, nd);  		next = rb_next(&n->nd); -		rb_erase(&n->nd, &p->root); -		mpol_put(n->policy); -		kmem_cache_free(sn_cache, n); +		sp_delete(p, n);  	}  	spin_unlock(&p->lock);  } +#ifdef CONFIG_NUMA_BALANCING +static int __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ +	bool numabalancing_default = false; + +	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) +		numabalancing_default = true; + +	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ +	if (numabalancing_override) +		set_numabalancing_state(numabalancing_override == 1); + +	if (nr_node_ids > 1 && !numabalancing_override) { +		pr_info("%s automatic NUMA balancing. " +			"Configure with numa_balancing= or the " +			"kernel.numa_balancing sysctl", +			numabalancing_default ? "Enabling" : "Disabling"); +		set_numabalancing_state(numabalancing_default); +	} +} + +static int __init setup_numabalancing(char *str) +{ +	int ret = 0; +	if (!str) +		goto out; + +	if (!strcmp(str, "enable")) { +		numabalancing_override = 1; +		ret = 1; +	} else if (!strcmp(str, "disable")) { +		numabalancing_override = -1; +		ret = 1; +	} +out: +	if (!ret) +		pr_warn("Unable to parse numa_balancing=\n"); + +	return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ +  /* assumes fs == KERNEL_DS */  void __init numa_policy_init(void)  { @@ -2244,13 +2610,22 @@ void __init numa_policy_init(void)  				     sizeof(struct sp_node),  				     0, SLAB_PANIC, NULL); +	for_each_node(nid) { +		preferred_node_policy[nid] = (struct mempolicy) { +			.refcnt = ATOMIC_INIT(1), +			.mode = MPOL_PREFERRED, +			.flags = MPOL_F_MOF | MPOL_F_MORON, +			.v = { .preferred_node = nid, }, +		}; +	} +  	/*  	 * Set interleaving policy for system init. Interleaving is only  	 * enabled across suitably sized nodes (default is >= 16MB), or  	 * fall back to the largest node if they're all smaller.  	 */  	nodes_clear(interleave_nodes); -	for_each_node_state(nid, N_HIGH_MEMORY) { +	for_each_node_state(nid, N_MEMORY) {  		unsigned long total_pages = node_present_pages(nid);  		/* Preserve the largest node */ @@ -2269,7 +2644,9 @@ void __init numa_policy_init(void)  		node_set(prefer, interleave_nodes);  	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) -		printk("numa_policy_init: interleaving failed\n"); +		pr_err("%s: interleaving failed\n", __func__); + +	check_numabalancing_enable();  }  /* Reset policy of current process to default */ @@ -2283,44 +2660,34 @@ void numa_default_policy(void)   */  /* - * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.   
*/ -#define MPOL_LOCAL MPOL_MAX  static const char * const policy_modes[] =  {  	[MPOL_DEFAULT]    = "default",  	[MPOL_PREFERRED]  = "prefer",  	[MPOL_BIND]       = "bind",  	[MPOL_INTERLEAVE] = "interleave", -	[MPOL_LOCAL]      = "local" +	[MPOL_LOCAL]      = "local",  };  #ifdef CONFIG_TMPFS  /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.   * @str:  string containing mempolicy to parse   * @mpol:  pointer to struct mempolicy pointer, returned on success. - * @no_context:  flag whether to "contextualize" the mempolicy   *   * Format of input:   *	<mode>[=<flags>][:<nodelist>]   * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy.  This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol - * mount option.  Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved.  Saving - * it again is redundant, but safe. - *   * On success, returns 0, else 1   */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol)  {  	struct mempolicy *new = NULL;  	unsigned short mode; -	unsigned short uninitialized_var(mode_flags); +	unsigned short mode_flags;  	nodemask_t nodes;  	char *nodelist = strchr(str, ':');  	char *flags = strchr(str, '='); @@ -2331,7 +2698,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  		*nodelist++ = '\0';  		if (nodelist_parse(nodelist, nodes))  			goto out; -		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) +		if (!nodes_subset(nodes, node_states[N_MEMORY]))  			goto out;  	} else  		nodes_clear(nodes); @@ -2339,12 +2706,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  	if (flags)  		*flags++ = '\0';	/* terminate mode string */ -	for (mode = 0; mode <= MPOL_LOCAL; mode++) { +	for (mode = 0; mode < MPOL_MAX; mode++) {  		if (!strcmp(str, policy_modes[mode])) {  			break;  		}  	} -	if (mode > MPOL_LOCAL) +	if (mode >= MPOL_MAX)  		goto out;  	switch (mode) { @@ -2365,7 +2732,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  		 * Default to online nodes with memory if no nodelist  		 */  		if (!nodelist) -			nodes = node_states[N_HIGH_MEMORY]; +			nodes = node_states[N_MEMORY];  		break;  	case MPOL_LOCAL:  		/* @@ -2408,24 +2775,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)  	if (IS_ERR(new))  		goto out; -	if (no_context) { -		/* save for contextualization */ -		new->w.user_nodemask = nodes; -	} else { -		int ret; -		NODEMASK_SCRATCH(scratch); -		if (scratch) { -			task_lock(current); -			ret = mpol_set_nodemask(new, &nodes, scratch); -			task_unlock(current); -		} else -			ret = -ENOMEM; -		NODEMASK_SCRATCH_FREE(scratch); -		if (ret) { -			mpol_put(new); -			goto out; -		} -	} +	/* +	 * Save nodes for mpol_to_str() to show the tmpfs mount options +	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. +	 */ +	if (mode != MPOL_PREFERRED) +		new->v.nodes = nodes; +	else if (nodelist) +		new->v.preferred_node = first_node(nodes); +	else +		new->flags |= MPOL_F_LOCAL; + +	/* +	 * Save nodes for contextualization: this will be used to "clone" +	 * the mempolicy in a specific context [cpuset] at a later time. 
+	 */ +	new->w.user_nodemask = nodes; +  	err = 0;  out: @@ -2445,67 +2811,46 @@ out:   * @buffer:  to contain formatted mempolicy string   * @maxlen:  length of @buffer   * @pol:  pointer to mempolicy to be formatted - * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask   * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string.  If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids.   */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)  {  	char *p = buffer; -	int l; -	nodemask_t nodes; -	unsigned short mode; -	unsigned short flags = pol ? pol->flags : 0; - -	/* -	 * Sanity check:  room for longest mode, flag and some nodes -	 */ -	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); +	nodemask_t nodes = NODE_MASK_NONE; +	unsigned short mode = MPOL_DEFAULT; +	unsigned short flags = 0; -	if (!pol || pol == &default_policy) -		mode = MPOL_DEFAULT; -	else +	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {  		mode = pol->mode; +		flags = pol->flags; +	}  	switch (mode) {  	case MPOL_DEFAULT: -		nodes_clear(nodes);  		break; -  	case MPOL_PREFERRED: -		nodes_clear(nodes);  		if (flags & MPOL_F_LOCAL) -			mode = MPOL_LOCAL;	/* pseudo-policy */ +			mode = MPOL_LOCAL;  		else  			node_set(pol->v.preferred_node, nodes);  		break; -  	case MPOL_BIND: -		/* Fall through */  	case MPOL_INTERLEAVE: -		if (no_context) -			nodes = pol->w.user_nodemask; -		else -			nodes = pol->v.nodes; +		nodes = pol->v.nodes;  		break; -  	default: -		BUG(); +		WARN_ON_ONCE(1); +		snprintf(p, maxlen, "unknown"); +		return;  	} -	l = strlen(policy_modes[mode]); -	if (buffer + maxlen < p + l + 1) -		return -ENOSPC; - -	strcpy(p, policy_modes[mode]); -	p += l; +	p += snprintf(p, maxlen, "%s", policy_modes[mode]);  	if (flags & MPOL_MODE_FLAGS) { -		if (buffer + maxlen < p + 2) -			return -ENOSPC; -		*p++ = '='; +		p += snprintf(p, buffer + maxlen - p, "=");  		/*  		 * Currently, the only defined flags are mutually exclusive @@ -2517,166 +2862,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)  	}  	if (!nodes_empty(nodes)) { -		if (buffer + maxlen < p + 2) -			return -ENOSPC; -		*p++ = ':'; +		p += snprintf(p, buffer + maxlen - p, ":");  	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);  	} -	return p - buffer; -} - -struct numa_maps { -	unsigned long pages; -	unsigned long anon; -	unsigned long active; -	unsigned long writeback; -	unsigned long mapcount_max; -	unsigned long dirty; -	unsigned long swapcache; -	unsigned long node[MAX_NUMNODES]; -}; - -static void gather_stats(struct page *page, void *private, int pte_dirty) -{ -	struct numa_maps *md = private; -	int count = page_mapcount(page); - -	md->pages++; -	if (pte_dirty || PageDirty(page)) -		md->dirty++; - -	if (PageSwapCache(page)) -		md->swapcache++; - -	if (PageActive(page) || PageUnevictable(page)) -		md->active++; - -	if (PageWriteback(page)) -		md->writeback++; - -	if (PageAnon(page)) -		md->anon++; - -	if (count > md->mapcount_max) -		md->mapcount_max = count; - -	md->node[page_to_nid(page)]++; -} - -#ifdef CONFIG_HUGETLB_PAGE -static void check_huge_range(struct vm_area_struct *vma, -		unsigned long start, unsigned 
long end, -		struct numa_maps *md) -{ -	unsigned long addr; -	struct page *page; -	struct hstate *h = hstate_vma(vma); -	unsigned long sz = huge_page_size(h); - -	for (addr = start; addr < end; addr += sz) { -		pte_t *ptep = huge_pte_offset(vma->vm_mm, -						addr & huge_page_mask(h)); -		pte_t pte; - -		if (!ptep) -			continue; - -		pte = *ptep; -		if (pte_none(pte)) -			continue; - -		page = pte_page(pte); -		if (!page) -			continue; - -		gather_stats(page, md, pte_dirty(*ptep)); -	} -} -#else -static inline void check_huge_range(struct vm_area_struct *vma, -		unsigned long start, unsigned long end, -		struct numa_maps *md) -{ -} -#endif - -/* - * Display pages allocated per node and memory policy via /proc. - */ -int show_numa_map(struct seq_file *m, void *v) -{ -	struct proc_maps_private *priv = m->private; -	struct vm_area_struct *vma = v; -	struct numa_maps *md; -	struct file *file = vma->vm_file; -	struct mm_struct *mm = vma->vm_mm; -	struct mempolicy *pol; -	int n; -	char buffer[50]; - -	if (!mm) -		return 0; - -	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); -	if (!md) -		return 0; - -	pol = get_vma_policy(priv->task, vma, vma->vm_start); -	mpol_to_str(buffer, sizeof(buffer), pol, 0); -	mpol_cond_put(pol); - -	seq_printf(m, "%08lx %s", vma->vm_start, buffer); - -	if (file) { -		seq_printf(m, " file="); -		seq_path(m, &file->f_path, "\n\t= "); -	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { -		seq_printf(m, " heap"); -	} else if (vma->vm_start <= mm->start_stack && -			vma->vm_end >= mm->start_stack) { -		seq_printf(m, " stack"); -	} - -	if (is_vm_hugetlb_page(vma)) { -		check_huge_range(vma, vma->vm_start, vma->vm_end, md); -		seq_printf(m, " huge"); -	} else { -		check_pgd_range(vma, vma->vm_start, vma->vm_end, -			&node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); -	} - -	if (!md->pages) -		goto out; - -	if (md->anon) -		seq_printf(m," anon=%lu",md->anon); - -	if (md->dirty) -		seq_printf(m," dirty=%lu",md->dirty); - -	if (md->pages != md->anon && md->pages != md->dirty) -		seq_printf(m, " mapped=%lu", md->pages); - -	if (md->mapcount_max > 1) -		seq_printf(m, " mapmax=%lu", md->mapcount_max); - -	if (md->swapcache) -		seq_printf(m," swapcache=%lu", md->swapcache); - -	if (md->active < md->pages && !is_vm_hugetlb_page(vma)) -		seq_printf(m," active=%lu", md->active); - -	if (md->writeback) -		seq_printf(m," writeback=%lu", md->writeback); - -	for_each_node_state(n, N_HIGH_MEMORY) -		if (md->node[n]) -			seq_printf(m, " N%d=%lu", n, md->node[n]); -out: -	seq_putc(m, '\n'); -	kfree(md); - -	if (m->count < m->size) -		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; -	return 0;  }  | 
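For context on the user-space entry points whose kernel paths this patch reworks (do_mbind() behind mbind(2) and do_set_mempolicy() behind set_mempolicy(2)), here is a minimal illustrative sketch, not part of the patch. It assumes libnuma's <numaif.h> wrappers (link with -lnuma); the node numbers are placeholders, and on a machine without those nodes the calls simply fail with EINVAL.

/* Illustrative sketch only, not part of the patch. Build: gcc demo.c -lnuma */
#include <numaif.h>		/* mbind(), set_mempolicy(), MPOL_* (libnuma) */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* placeholder: node 0 only */
	size_t len = 4UL << 20;			/* 4 MiB */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Bind the range to node 0; migrate pages that are already faulted in. */
	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	/* Interleave this task's future allocations across nodes 0 and 1. */
	nodemask = (1UL << 0) | (1UL << 1);
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
		perror("set_mempolicy");

	memset(buf, 0, len);			/* fault the pages in */
	munmap(buf, len);
	return 0;
}

The MPOL_MF_MOVE case here is what, per the hunks above, now flows through queue_pages_range() and the migrate_pages() call in do_mbind().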
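The patch also changes how the migrate_pages(2) syscall pins its target task (rcu_read_lock() plus get_task_struct()/put_task_struct() instead of taking the mm reference under tasklist_lock). A hedged sketch of the corresponding user-space call, again assuming the libnuma wrapper and using a placeholder pid:

/* Illustrative sketch only, not part of the patch. Build: gcc demo2.c -lnuma */
#include <numaif.h>		/* migrate_pages() wrapper (libnuma) */
#include <stdio.h>

int main(void)
{
	int pid = 1234;				/* placeholder target pid */
	unsigned long old_nodes = 1UL << 0;	/* move pages away from node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... and onto node 1 */
	long ret;

	/*
	 * Moving another process's pages needs CAP_SYS_NICE or matching
	 * credentials; that is the check the patch reorganises above.
	 */
	ret = migrate_pages(pid, 8 * sizeof(unsigned long),
			    &old_nodes, &new_nodes);
	if (ret < 0)
		perror("migrate_pages");
	else
		printf("pages that could not be moved: %ld\n", ret);
	return 0;
}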
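Finally, the mpol_parse_str() changes near the end drop the no_context argument and always record the user nodemask; the string it parses, <mode>[=<flags>][:<nodelist>], is the tmpfs mpol= mount option. A small sketch of handing it such a string through mount(2); the mount point and node list are made up, and the call needs root on a NUMA machine:

/* Illustrative sketch only, not part of the patch. Needs root and NUMA. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/*
	 * The mpol= value is what mpol_parse_str() sees. Other valid forms
	 * include "mpol=bind=static:0,2", "mpol=prefer:1" and "mpol=local".
	 */
	if (mount("tmpfs", "/mnt/numa-tmp", "tmpfs", 0,
		  "size=1g,mpol=interleave:0-3"))
		perror("mount");
	return 0;
}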
