Diffstat (limited to 'mm/mempolicy.c')
 -rw-r--r--   mm/mempolicy.c   475
1 file changed, 246 insertions(+), 229 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e2df1c1fb41..8f5330d74f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
  *                the allocation to memory nodes instead
  *
  * preferred      Try a specific node first before normal fallback.
- *                As a special case node -1 here means do the allocation
+ *                As a special case NUMA_NO_NODE here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
@@ -65,6 +65,8 @@
    kernel is not always grateful with that.
 */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -91,6 +93,7 @@
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
+#include <linux/printk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -123,16 +126,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 static struct mempolicy *get_task_policy(struct task_struct *p)
 {
 	struct mempolicy *pol = p->mempolicy;
-	int node;
 
 	if (!pol) {
-		node = numa_node_id();
-		if (node != -1)
-			pol = &preferred_node_policy[node];
+		int node = numa_node_id();
 
-		/* preferred_node_policy is not initialised early in boot */
-		if (!pol->mode)
-			pol = NULL;
+		if (node != NUMA_NO_NODE) {
+			pol = &preferred_node_policy[node];
+			/*
+			 * preferred_node_policy is not initialised early in
+			 * boot
+			 */
+			if (!pol->mode)
+				pol = NULL;
+		}
 	}
 
 	return pol;
@@ -161,19 +167,7 @@ static const struct mempolicy_operations {
 /* Check that the nodemask contains at least one populated zone */
 static int is_valid_nodemask(const nodemask_t *nodemask)
 {
-	int nd, k;
-
-	for_each_node_mask(nd, *nodemask) {
-		struct zone *z;
-
-		for (k = 0; k <= policy_zone; k++) {
-			z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				return 1;
-		}
-	}
-
-	return 0;
+	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 }
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +264,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	struct mempolicy *policy;
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
-		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
+		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
@@ -485,8 +479,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
-/* Scan through pages checking if pages follow certain conditions. */
-static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+/*
+ * Scan through pages checking if pages follow certain conditions,
+ * and move them to the pagelist if they do.
+ */
+static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags,
 		void *private)
@@ -508,9 +505,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		/*
 		 * vm_normal_page() filters out zero pages, but there might
 		 * still be PageReserved pages to skip, perhaps in a VDSO.
-		 * And we cannot move PageKsm pages sensibly or safely yet.
 		 */
-		if (PageReserved(page) || PageKsm(page))
+		if (PageReserved(page))
			continue;
 		nid = page_to_nid(page);
 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
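The rewritten is_valid_nodemask() above collapses a nested zone scan into one mask intersection with node_states[N_MEMORY]. A minimal userspace sketch of the same check, with a plain bitmask standing in for nodemask_t (toy types and values, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Plain bitmask standing in for nodemask_t; bit n == node n. */
    typedef unsigned long nodemask_t;

    /* Pretend nodes 0 and 2 have memory (node_states[N_MEMORY]). */
    static const nodemask_t nodes_with_memory = 0x5;

    /* Same shape as the new is_valid_nodemask(): does any requested
     * node actually have memory? */
    static bool is_valid_nodemask(nodemask_t requested)
    {
        return (requested & nodes_with_memory) != 0;
    }

    int main(void)
    {
        printf("%d\n", is_valid_nodemask(0x2)); /* 0: node 1 is memoryless */
        printf("%d\n", is_valid_nodemask(0x6)); /* 1: node 2 has memory */
        return 0;
    }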
@@ -525,7 +521,36 @@
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
+		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
+		void *private)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	int nid;
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+	entry = huge_ptep_get((pte_t *)pmd);
+	if (!pte_present(entry))
+		goto unlock;
+	page = pte_page(entry);
+	nid = page_to_nid(page);
+	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+		goto unlock;
+	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+	if (flags & (MPOL_MF_MOVE_ALL) ||
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
+		isolate_huge_page(page, private);
+unlock:
+	spin_unlock(ptl);
+#else
+	BUG();
+#endif
+}
+
+static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long addr, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags,
 		void *private)
@@ -536,17 +561,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (!pmd_present(*pmd))
+			continue;
+		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
+			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
+						flags, private);
+			continue;
+		}
 		split_huge_page_pmd(vma, addr, pmd);
 		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(vma, pmd, addr, next, nodes,
+		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
 				    flags, private))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		unsigned long addr, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags,
 		void *private)
@@ -557,16 +589,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
+		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
+			continue;
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(vma, pud, addr, next, nodes,
+		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 				    flags, private))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct vm_area_struct *vma,
+static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags,
 		void *private)
@@ -579,14 +613,14 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(vma, pgd, addr, next, nodes,
+		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 				    flags, private))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
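The renamed queue_pages_*() helpers above walk the page tables level by level, clipping the working range at each entry boundary exactly as pmd_addr_end() does. A small standalone sketch of that clipping loop (the 2 MiB entry size is illustrative, not taken from this diff):

    #include <stdio.h>

    /* Toy level size: pretend one "pmd" entry covers 2 MiB. */
    #define PMD_SIZE (2UL << 20)
    #define PMD_MASK (~(PMD_SIZE - 1))

    /* Mirrors pmd_addr_end(): end of the current entry, or end of range. */
    static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
    {
        unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
        return boundary < end ? boundary : end;
    }

    int main(void)
    {
        unsigned long addr = 0x1ff000, end = 0x600000, next;

        /* Same do/while shape as the queue_pages_pmd_range() loop above. */
        do {
            next = pmd_addr_end(addr, end);
            printf("entry spans [%#lx, %#lx)\n", addr, next);
        } while ((addr = next) != end);
        return 0;
    }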
@@ -600,7 +634,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 			unsigned long addr, unsigned long end)
 {
 	int nr_updated;
-	BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
 	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 	if (nr_updated)
@@ -614,26 +647,27 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 {
 	return 0;
 }
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
- * Check if all pages in a range are on a set of nodes.
- * If pagelist != NULL then isolate pages from the LRU and
- * put them on the pagelist.
+ * Walk through page tables and collect pages to be migrated.
+ *
+ * If pages found in a given range are on a set of nodes (determined by
+ * @nodes and @flags,) it's isolated and queued to the pagelist which is
+ * passed via @private.)
 */
-static struct vm_area_struct *
-check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
+static int
+queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags, void *private)
 {
-	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
+	int err = 0;
+	struct vm_area_struct *vma, *prev;
 
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
+	vma = find_vma(mm, start);
+	if (!vma)
+		return -EFAULT;
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unsigned long endvma = vma->vm_end;
 
 		if (endvma > end)
@@ -643,14 +677,11 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 		}
 
-		if (is_vm_hugetlb_page(vma))
-			goto next;
-
 		if (flags & MPOL_MF_LAZY) {
 			change_prot_numa(vma, start, endvma);
 			goto next;
@@ -660,17 +691,15 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		if ((flags & MPOL_MF_STRICT) ||
 		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 		      vma_migratable(vma))) {
 
-			err = check_pgd_range(vma, start, endvma, nodes,
+			err = queue_pages_pgd_range(vma, start, endvma, nodes,
 						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
+			if (err)
 				break;
-			}
 		}
next:
 		prev = vma;
 	}
-	return first;
+	return err;
 }
 
@@ -745,7 +774,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
-			continue;
+			if (mpol_equal(vma_policy(vma), new_pol))
+				continue;
+			/* vma_merge() joined vma && vma->next, case 8 */
+			goto replace;
 		}
 		if (vma->vm_start != vmstart) {
 			err = split_vma(vma->vm_mm, vma, vmstart, 1);
@@ -757,6 +789,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
+ replace:
 		err = vma_replace_policy(vma, new_pol);
 		if (err)
 			goto out;
@@ -766,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 	return err;
 }
 
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy.  Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
-}
-
 /* Set the process memory policy */
 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 			     nodemask_t *nodes)
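queue_pages_range() above refuses discontiguous mappings unless MPOL_MF_DISCONTIG_OK is set: it fails if the range runs past the last VMA or if a hole separates two neighbouring VMAs. A toy model of that hole check over a singly linked VMA list (hypothetical struct, not the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    /* Minimal stand-in for the VMA list queue_pages_range() walks. */
    struct vma {
        unsigned long start, end;
        struct vma *next;
    };

    /* EFAULT-style failure if the list does not cover up to range_end. */
    static int check_contiguous(struct vma *v, unsigned long range_end)
    {
        struct vma *prev = NULL;

        for (; v && v->start < range_end; v = v->next) {
            if (!v->next && v->end < range_end)
                return -1;      /* range runs past the last vma */
            if (prev && prev->end < v->start)
                return -1;      /* hole between neighbours */
            prev = v;
        }
        return 0;
    }

    int main(void)
    {
        struct vma b = { 0x3000, 0x5000, NULL };
        struct vma a = { 0x1000, 0x2000, &b }; /* hole at [0x2000, 0x3000) */

        printf("%d\n", check_contiguous(&a, 0x5000)); /* -1 */
        return 0;
    }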
@@ -832,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	}
 	old = current->mempolicy;
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
@@ -999,7 +1001,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 
 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 {
-	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	if (PageHuge(page))
+		return alloc_huge_page_node(page_hstate(compound_head(page)),
+					node);
+	else
+		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1022,15 +1028,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
 	 */
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
-	check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, new_node_page, dest,
-					false, MIGRATE_SYNC,
-					MR_SYSCALL);
+		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
+					MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
-			putback_lru_pages(&pagelist);
+			putback_movable_pages(&pagelist);
 	}
 
 	return err;
@@ -1093,7 +1098,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	tmp = *from;
 	while (!nodes_empty(tmp)) {
 		int s,d;
-		int source = -1;
+		int source = NUMA_NO_NODE;
 		int dest = 0;
 
 		for_each_node_mask(s, tmp) {
@@ -1128,7 +1133,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 			if (!node_isset(dest, tmp))
 				break;
 		}
-		if (source == -1)
+		if (source == NUMA_NO_NODE)
 			break;
 
 		node_clear(source, tmp);
@@ -1148,16 +1153,17 @@ out:
 
 /*
 * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+	struct vm_area_struct *vma;
 	unsigned long uninitialized_var(address);
 
+	vma = find_vma(current->mm, start);
 	while (vma) {
 		address = page_address_in_vma(page, vma);
 		if (address != -EFAULT)
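Several call sites above swap a literal -1 for NUMA_NO_NODE; do_migrate_pages() uses it as the "no candidate found" sentinel when pairing source and destination nodes. A compact sketch of that sentinel convention (bitmask stand-ins for nodemask_t, not kernel code):

    #include <stdio.h>

    #define NUMA_NO_NODE (-1)

    /* First node set in both masks, NUMA_NO_NODE if none -- the same
     * sentinel convention the do_migrate_pages() loop now spells out. */
    static int first_common_node(unsigned long from, unsigned long to)
    {
        for (int n = 0; n < (int)(8 * sizeof(unsigned long)); n++)
            if ((from >> n & 1) && (to >> n & 1))
                return n;
        return NUMA_NO_NODE;
    }

    int main(void)
    {
        int source = first_common_node(0x6, 0x8); /* nodes {1,2} vs {3} */

        if (source == NUMA_NO_NODE)
            printf("no candidate source node\n");
        return 0;
    }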
@@ -1165,6 +1171,10 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 		vma = vma->vm_next;
 	}
 
+	if (PageHuge(page)) {
+		BUG_ON(!vma);
+		return alloc_huge_page_noerr(vma, address, 1);
+	}
 	/*
 	 * if !vma, alloc_page_vma() will use task or system default policy
 	 */
@@ -1183,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
 	return NULL;
 }
@@ -1193,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 		     unsigned short mode, unsigned short mode_flags,
 		     nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
@@ -1235,7 +1244,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 		 start, start + len, mode, mode_flags,
-		 nmask ? nodes_addr(*nmask)[0] : -1);
+		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 
@@ -1259,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = check_range(mm, start, end, nmask,
+	err = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
+	if (!err)
 		err = mbind_range(mm, start, end, new);
 
 	if (!err) {
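new_page() now re-derives the VMA from the queued start address with find_vma() instead of trusting a VMA pointer smuggled through the private argument. A toy find_vma() over a sorted VMA list shows the lookup rule (first VMA whose end lies above the address; hypothetical struct, not the kernel's):

    #include <stdio.h>
    #include <stddef.h>

    struct vma {
        unsigned long start, end;
        struct vma *next;
    };

    /* find_vma()-style lookup: first vma whose end is above addr. */
    static struct vma *find_vma(struct vma *v, unsigned long addr)
    {
        for (; v; v = v->next)
            if (addr < v->end)
                return v;
        return NULL;
    }

    int main(void)
    {
        struct vma b = { 0x4000, 0x6000, NULL };
        struct vma a = { 0x1000, 0x2000, &b };

        /* An address in the gap resolves to the next vma forward. */
        struct vma *v = find_vma(&a, 0x3000);
        printf("resumes at [%#lx, %#lx)\n", v->start, v->end);
        return 0;
    }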
@@ -1271,18 +1278,16 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma,
-						false, MIGRATE_SYNC,
-						MR_MEMPOLICY_MBIND);
+			nr_failed = migrate_pages(&pagelist, new_page, NULL,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
-				putback_lru_pages(&pagelist);
+				putback_movable_pages(&pagelist);
 		}
 
 		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
-		putback_lru_pages(&pagelist);
+		putback_movable_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
  mpol_out:
@@ -1358,7 +1363,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 }
 
 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
-		unsigned long, mode, unsigned long __user *, nmask,
+		unsigned long, mode, const unsigned long __user *, nmask,
 		unsigned long, maxnode, unsigned, flags)
 {
 	nodemask_t nodes;
@@ -1379,7 +1384,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
 }
 
 /* Set the process memory policy */
-SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
+SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
 		unsigned long, maxnode)
 {
 	int err;
@@ -1521,10 +1526,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
 
 #ifdef CONFIG_COMPAT
 
-asmlinkage long compat_sys_get_mempolicy(int __user *policy,
-				     compat_ulong_t __user *nmask,
-				     compat_ulong_t maxnode,
-				     compat_ulong_t addr, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+		       compat_ulong_t __user *, nmask,
+		       compat_ulong_t, maxnode,
+		       compat_ulong_t, addr, compat_ulong_t, flags)
 {
 	long err;
 	unsigned long __user *nm = NULL;
@@ -1551,8 +1556,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 	return err;
 }
 
-asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
-				     compat_ulong_t maxnode)
+COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
+		       compat_ulong_t, maxnode)
 {
 	long err = 0;
 	unsigned long __user *nm = NULL;
@@ -1574,9 +1579,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 	return sys_set_mempolicy(mode, nm, nr_bits+1);
 }
 
-asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
-			     compat_ulong_t mode, compat_ulong_t __user *nmask,
-			     compat_ulong_t maxnode, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
+		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
+		       compat_ulong_t, maxnode, compat_ulong_t, flags)
 {
 	long err = 0;
 	unsigned long __user *nm = NULL;
@@ -1602,9 +1607,9 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 
 /*
 * get_vma_policy(@task, @vma, @addr)
- * @task - task for fallback if vma policy == default
- * @vma - virtual memory area whose policy is sought
- * @addr - address in @vma for shared policy lookup
+ * @task: task for fallback if vma policy == default
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
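The compat wrappers above exist because a 32-bit caller hands the kernel a nodemask packed in 32-bit words, which has to be widened into the native unsigned long layout before the regular syscall body runs. A hypothetical userspace sketch of that widening (little-endian layout assumed; the kernel's actual copy path differs):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Pack an array of 32-bit mask words into native 64-bit words --
     * the kind of widening a compat wrapper must do before calling
     * the native entry point. */
    static void widen_mask(const uint32_t *in, size_t nwords, uint64_t *out)
    {
        memset(out, 0, ((nwords + 1) / 2) * sizeof(*out));
        for (size_t i = 0; i < nwords; i++)
            out[i / 2] |= (uint64_t)in[i] << (32 * (i % 2));
    }

    int main(void)
    {
        uint32_t compat[2] = { 0x00000005, 0x00000001 }; /* nodes 0,2,32 */
        uint64_t native[1];

        widen_mask(compat, 2, native);
        printf("%#llx\n", (unsigned long long)native[0]); /* 0x100000005 */
        return 0;
    }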
@@ -1644,6 +1649,50 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+	struct mempolicy *pol = get_task_policy(task);
+	if (vma) {
+		if (vma->vm_ops && vma->vm_ops->get_policy) {
+			bool ret = false;
+
+			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+			if (pol && (pol->flags & MPOL_F_MOF))
+				ret = true;
+			mpol_cond_put(pol);
+
+			return ret;
+		} else if (vma->vm_policy) {
+			pol = vma->vm_policy;
+		}
+	}
+
+	if (!pol)
+		return default_policy.flags & MPOL_F_MOF;
+
+	return pol->flags & MPOL_F_MOF;
+}
+
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+	enum zone_type dynamic_policy_zone = policy_zone;
+
+	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+	/*
+	 * if policy->v.nodes has movable memory only,
+	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+	 *
+	 * policy->v.nodes is intersect with node_states[N_MEMORY].
+	 * so if the following test faile, it implies
+	 * policy->v.nodes has movable memory only.
+	 */
+	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+		dynamic_policy_zone = ZONE_MOVABLE;
+
+	return zone >= dynamic_policy_zone;
+}
+
 /*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
@@ -1652,7 +1701,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->mode == MPOL_BIND) &&
-			gfp_zone(gfp) >= policy_zone &&
+			apply_policy_zone(policy, gfp_zone(gfp)) &&
 			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
 		return &policy->v.nodes;
 
@@ -1703,21 +1752,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 /*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
- * @policy must be protected by freeing by the caller.  If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy.  The system default policy requires no
- * such protection.
 */
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
 {
 	struct mempolicy *policy;
+	int node = numa_mem_id();
 
 	if (in_interrupt())
-		return numa_node_id();
+		return node;
 
 	policy = current->mempolicy;
 	if (!policy || policy->flags & MPOL_F_LOCAL)
-		return numa_node_id();
+		return node;
 
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
@@ -1737,11 +1783,11 @@ unsigned slab_node(void)
 		struct zonelist *zonelist;
 		struct zone *zone;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
-		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		zonelist = &NODE_DATA(node)->node_zonelists[0];
 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes,
 							&zone);
-		return zone ? zone->node : numa_node_id();
+		return zone ? zone->node : node;
 	}
 
 	default:
@@ -1756,7 +1802,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
 	int c;
-	int nid = -1;
+	int nid = NUMA_NO_NODE;
 
 	if (!nnodes)
 		return numa_node_id();
@@ -1793,11 +1839,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 
 /*
 * Return the bit number of a random bit set in the nodemask.
- * (returns -1 if nodemask is empty)
+ * (returns NUMA_NO_NODE if nodemask is empty)
 */
int node_random(const nodemask_t *maskp)
 {
-	int w, bit = -1;
+	int w, bit = NUMA_NO_NODE;
 
 	w = nodes_weight(*maskp);
 	if (w)
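offset_il_node() (context above) reduces an offset modulo the mask weight and then walks to that set bit, which is what spreads interleaved allocations round-robin over the allowed nodes. A runnable sketch of picking the n-th set node from a bitmask (toy stand-in, not the kernel helper):

    #include <stdio.h>

    /* offset_il_node()-style pick: offset modulo the mask weight
     * selects which set bit to use. */
    static int nth_set_node(unsigned long mask, unsigned int off)
    {
        unsigned int weight = (unsigned int)__builtin_popcountl(mask);
        unsigned int target;

        if (!weight)
            return -1;
        target = off % weight;
        for (int n = 0; ; n++)
            if (((mask >> n) & 1) && target-- == 0)
                return n;
    }

    int main(void)
    {
        /* nodes {1, 3, 4}: offsets 0..3 map to 1 3 4 1 */
        for (unsigned int off = 0; off < 4; off++)
            printf("%d ", nth_set_node(0x1a, off));
        printf("\n");
        return 0;
    }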
@@ -1809,18 +1855,18 @@ int node_random(const nodemask_t *maskp)
 #ifdef CONFIG_HUGETLBFS
 /*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
- * @vma = virtual memory area whose policy is sought
- * @addr = address in @vma for shared policy lookup and interleave policy
- * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags: for requested zone
+ * @mpol: pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
- * Must be protected by get_mems_allowed()
+ * Must be protected by read_mems_allowed_begin()
 */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 				gfp_t gfp_flags, struct mempolicy **mpol,
@@ -1984,7 +2030,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 retry_cpuset:
 	pol = get_vma_policy(current, vma, addr);
-	cpuset_mems_cookie = get_mems_allowed();
+	cpuset_mems_cookie = read_mems_allowed_begin();
 
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1992,7 +2038,7 @@ retry_cpuset:
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 			goto retry_cpuset;
 
 		return page;
@@ -2002,7 +2048,7 @@ retry_cpuset:
 			policy_nodemask(gfp, pol));
 	if (unlikely(mpol_needs_cond_ref(pol)))
 		__mpol_put(pol);
-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 	return page;
 }
@@ -2036,7 +2082,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		pol = &default_policy;
 
 retry_cpuset:
-	cpuset_mems_cookie = get_mems_allowed();
+	cpuset_mems_cookie = read_mems_allowed_begin();
 
 	/*
 	 * No reference counting needed for current->mempolicy
@@ -2049,13 +2095,23 @@ retry_cpuset:
 			policy_zonelist(gfp, pol, numa_node_id()),
 			policy_nodemask(gfp, pol));
 
-	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 
 	return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
+int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+	struct mempolicy *pol = mpol_dup(vma_policy(src));
+
+	if (IS_ERR(pol))
+		return PTR_ERR(pol);
+	dst->vm_policy = pol;
+	return 0;
+}
+
 /*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
@@ -2083,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	} else
 		*new = *old;
 
-	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		if (new->flags & MPOL_F_REBINDING)
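get_mems_allowed()/put_mems_allowed() become read_mems_allowed_begin()/read_mems_allowed_retry() here, and the retry test is reordered so an allocation only loops when it failed and the cpuset mask changed underneath it. The pattern is a seqcount read side; a simplified single-threaded model (toy counter, not the kernel's seqcount API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Toy seqcount mirroring the read_mems_allowed_begin()/_retry()
     * pattern: readers snapshot an even sequence, do their work, and
     * retry if a writer bumped the counter meanwhile. */
    static atomic_uint seq;

    static unsigned int read_begin(void)
    {
        unsigned int s;
        while ((s = atomic_load(&seq)) & 1)  /* writer in progress: spin */
            ;
        return s;
    }

    static bool read_retry(unsigned int s)
    {
        return atomic_load(&seq) != s;
    }

    int main(void)
    {
        unsigned int cookie;
        int result;

        do {
            cookie = read_begin();
            result = 42;        /* read the protected data here */
        } while (read_retry(cookie)); /* no writer in this demo: one pass */
        printf("%d\n", result);
        return 0;
    }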
@@ -2091,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 		else
 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
-	rcu_read_unlock();
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
@@ -2215,9 +2269,9 @@ static void sp_free(struct sp_node *n)
 /**
 * mpol_misplaced - check whether current page node is valid in policy
 *
- * @page   - page to be checked
- * @vma    - vm area where page mapped
- * @addr   - virtual address where page mapped
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
@@ -2235,6 +2289,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 	struct zone *zone;
 	int curnid = page_to_nid(page);
 	unsigned long pgoff;
+	int thiscpu = raw_smp_processor_id();
+	int thisnid = cpu_to_node(thiscpu);
 	int polnid = -1;
 	int ret = -1;
@@ -2283,33 +2339,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 
 	/* Migrate the page towards the node whose CPU is referencing it */
 	if (pol->flags & MPOL_F_MORON) {
-		int last_nid;
-
-		polnid = numa_node_id();
+		polnid = thisnid;
 
-		/*
-		 * Multi-stage node selection is used in conjunction
-		 * with a periodic migration fault to build a temporal
-		 * task<->page relation. By using a two-stage filter we
-		 * remove short/unlikely relations.
-		 *
-		 * Using P(p) ~ n_p / n_t as per frequentist
-		 * probability, we can equate a task's usage of a
-		 * particular page (n_p) per total usage of this
-		 * page (n_t) (in a given time-span) to a probability.
-		 *
-		 * Our periodic faults will sample this probability and
-		 * getting the same result twice in a row, given these
-		 * samples are fully independent, is then given by
-		 * P(n)^2, provided our sample period is sufficiently
-		 * short compared to the usage pattern.
-		 *
-		 * This quadric squishes small probabilities, making
-		 * it less likely we act on an unlikely task<->page
-		 * relation.
-		 */
-		last_nid = page_xchg_last_nid(page, polnid);
-		if (last_nid != polnid)
+		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
 			goto out;
 	}
 
@@ -2385,9 +2417,9 @@ restart:
 		*mpol_new = *n->policy;
 		atomic_set(&mpol_new->refcnt, 1);
-		sp_node_init(n_new, n->end, end, mpol_new);
-		sp_insert(sp, n_new);
+		sp_node_init(n_new, end, n->end, mpol_new);
 		n->end = start;
+		sp_insert(sp, n_new);
 		n_new = NULL;
 		mpol_new = NULL;
 		break;
 
@@ -2483,7 +2515,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 		 vma->vm_pgoff,
 		 sz, npol ? npol->mode : -1,
 		 npol ? npol->flags : -1,
-		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
+		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2515,7 +2547,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-static bool __initdata numabalancing_override;
+static int __initdata numabalancing_override;
 
 static void __init check_numabalancing_enable(void)
 {
@@ -2524,9 +2556,15 @@ static void __init check_numabalancing_enable(void)
 	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
 		numabalancing_default = true;
 
+	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+	if (numabalancing_override)
+		set_numabalancing_state(numabalancing_override == 1);
+
 	if (nr_node_ids > 1 && !numabalancing_override) {
-		printk(KERN_INFO "Enabling automatic NUMA balancing. "
-			"Configure with numa_balancing= or sysctl");
+		pr_info("%s automatic NUMA balancing. "
+			"Configure with numa_balancing= or the "
+			"kernel.numa_balancing sysctl",
+			numabalancing_default ? "Enabling" : "Disabling");
 		set_numabalancing_state(numabalancing_default);
 	}
 }
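numabalancing_override becomes a tri-state above: 0 means no boot option was given, 1 forces enabling, -1 forces disabling, so an explicit numa_balancing= always beats the Kconfig default. A small sketch of that resolution order (hypothetical names, not the kernel functions):

    #include <stdio.h>
    #include <string.h>

    /* Tri-state in the style of numabalancing_override:
     * 0 = no option given, 1 = force-enable, -1 = force-disable. */
    static int override;

    static void parse_option(const char *str)
    {
        if (!strcmp(str, "enable"))
            override = 1;
        else if (!strcmp(str, "disable"))
            override = -1;
    }

    static int resolve(int build_default)
    {
        if (override)
            return override == 1;   /* explicit boot option wins */
        return build_default;       /* else fall back to the config default */
    }

    int main(void)
    {
        parse_option("disable");
        printf("balancing on? %d\n", resolve(1)); /* 0: override wins */
        return 0;
    }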
" + "Configure with numa_balancing= or the " + "kernel.numa_balancing sysctl", + numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } @@ -2536,18 +2574,17 @@ static int __init setup_numabalancing(char *str) int ret = 0; if (!str) goto out; - numabalancing_override = true; if (!strcmp(str, "enable")) { - set_numabalancing_state(true); + numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); + numabalancing_override = -1; ret = 1; } out: if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + pr_warn("Unable to parse numa_balancing=\n"); return ret; } @@ -2607,7 +2644,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } @@ -2775,62 +2812,45 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? pol->flags : 0; + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); - - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2842,10 +2862,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; + p += snprintf(p, buffer + maxlen - p, ":"); p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } - return p - buffer; } |
