--- zzzz-none-000/linux-3.10.107/mm/mempolicy.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/mm/mempolicy.c 2021-02-04 17:41:59.000000000 +0000 @@ -65,6 +65,8 @@ kernel is not always grateful with that. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -91,6 +93,7 @@ #include #include #include +#include #include #include @@ -120,22 +123,23 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; -static struct mempolicy *get_task_policy(struct task_struct *p) +struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; - if (!pol) { - node = numa_node_id(); - if (node != NUMA_NO_NODE) - pol = &preferred_node_policy[node]; + if (pol) + return pol; + node = numa_node_id(); + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; + if (pol->mode) + return pol; } - return pol; + return &default_policy; } static const struct mempolicy_operations { @@ -158,12 +162,6 @@ enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; -/* Check that the nodemask contains at least one populated zone */ -static int is_valid_nodemask(const nodemask_t *nodemask) -{ - return nodes_intersects(*nodemask, node_states[N_MEMORY]); -} - static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; @@ -198,7 +196,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { - if (!is_valid_nodemask(nodes)) + if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; @@ -230,7 +228,7 @@ nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); @@ -473,21 +471,34 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan through pages checking if pages follow certain conditions. */ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) +struct queue_pages { + struct list_head *pagelist; + unsigned long flags; + nodemask_t *nmask; + struct vm_area_struct *prev; +}; + +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) { - pte_t *orig_pte; + struct vm_area_struct *vma = walk->vma; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; pte_t *pte; spinlock_t *ptl; - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { - struct page *page; - int nid; + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); @@ -500,80 +511,50 @@ if (PageReserved(page)) continue; nid = page_to_nid(page); - if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, private, flags); - else - break; - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(orig_pte, ptl); - return addr != end; -} - -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; - if (check_pte_range(vma, pmd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pmd++, addr = next, addr != end); - return 0; -} - -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - if (check_pmd_range(vma, pud, addr, next, nodes, - flags, private)) - return -EIO; - } while (pud++, addr = next, addr != end); + migrate_page_add(page, qp->pagelist, flags); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - void *private) -{ - pgd_t *pgd; - unsigned long next; +static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; + struct page *page; + spinlock_t *ptl; + pte_t entry; - pgd = pgd_offset(vma->vm_mm, addr); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - if (check_pud_range(vma, pgd, addr, next, nodes, - flags, private)) - return -EIO; - } while (pgd++, addr = next, addr != end); + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); + nid = page_to_nid(page); + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, qp->pagelist); +unlock: + spin_unlock(ptl); +#else + BUG(); +#endif return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +#ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these @@ -587,9 +568,8 @@ unsigned long addr, unsigned long end) { int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); @@ -601,60 +581,75 @@ { return 0; } -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#endif /* CONFIG_NUMA_BALANCING */ -/* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. - */ -static int -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, void *private) +static int queue_pages_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) { - int err = 0; - struct vm_area_struct *vma, *prev; - - vma = find_vma(mm, start); - if (!vma) - return -EFAULT; - prev = NULL; - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - unsigned long endvma = vma->vm_end; - - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; + struct vm_area_struct *vma = walk->vma; + struct queue_pages *qp = walk->private; + unsigned long endvma = vma->vm_end; + unsigned long flags = qp->flags; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (prev && prev->vm_end < vma->vm_start) - return -EFAULT; - } + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return -EFAULT; + if (qp->prev && qp->prev->vm_end < vma->vm_start) + return -EFAULT; + } - if (is_vm_hugetlb_page(vma)) - goto next; + qp->prev = vma; - if (flags & MPOL_MF_LAZY) { + if (flags & MPOL_MF_LAZY) { + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) change_prot_numa(vma, start, endvma); - goto next; - } + return 1; + } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) { + if ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma))) + /* queue pages from current vma */ + return 0; + return 1; +} - err = check_pgd_range(vma, start, endvma, nodes, - flags, private); - if (err) - break; - } -next: - prev = vma; - } - return err; +/* + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) + */ +static int +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) +{ + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .prev = NULL, + }; + struct mm_walk queue_pages_walk = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, + .mm = mm, + .private = &qp, + }; + + return walk_page_range(start, end, &queue_pages_walk); } /* @@ -724,8 +719,8 @@ pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; @@ -754,42 +749,11 @@ return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. - */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; - struct mm_struct *mm = current->mm; NODEMASK_SCRATCH(scratch); int ret; @@ -801,33 +765,20 @@ ret = PTR_ERR(new); goto out; } - /* - * prevent changing our mempolicy while show_numa_maps() - * is using it. - * Note: do_set_mempolicy() can be called at init time - * with no 'mm'. - */ - if (mm) - down_write(&mm->mmap_sem); + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); task_unlock(current); - if (mm) - up_write(&mm->mmap_sem); - mpol_put(old); ret = 0; out: @@ -987,7 +938,12 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE, 0); } /* @@ -1010,14 +966,14 @@ * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; @@ -1042,10 +998,6 @@ down_read(&mm->mmap_sem); - err = migrate_vmas(mm, from, to, flags); - if (err) - goto out; - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1080,7 +1032,7 @@ tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = -1; + int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { @@ -1115,7 +1067,7 @@ if (!node_isset(dest, tmp)) break; } - if (source == -1) + if (source == NUMA_NO_NODE) break; node_clear(source, tmp); @@ -1125,7 +1077,6 @@ if (err < 0) break; } -out: up_read(&mm->mmap_sem); if (err < 0) return err; @@ -1153,6 +1104,10 @@ vma = vma->vm_next; } + if (PageHuge(page)) { + BUG_ON(!vma); + return alloc_huge_page_noerr(vma, address, 1); + } /* * if !vma, alloc_page_vma() will use task or system default policy */ @@ -1246,7 +1201,7 @@ if (err) goto mpol_out; - err = check_range(mm, start, end, nmask, + err = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); if (!err) err = mbind_range(mm, start, end, new); @@ -1256,16 +1211,16 @@ if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_page, + nr_failed = migrate_pages(&pagelist, new_page, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); up_write(&mm->mmap_sem); mpol_out: @@ -1341,7 +1296,7 @@ } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, + unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; @@ -1362,7 +1317,7 @@ } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { int err; @@ -1504,10 +1459,10 @@ #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1534,8 +1489,8 @@ return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; @@ -1555,9 +1510,9 @@ return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; @@ -1579,32 +1534,14 @@ #endif -/* - * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup - * - * Returns effective policy for a VMA at specified address. - * Falls back to @task or system default policy, as necessary. - * Current or other task's task mempolicy and non-shared vma policies must be - * protected by task_lock(task) by the caller. - * Shared policies [those marked as MPOL_F_SHARED] require an extra reference - * count--added by the get_policy() vm_op, as appropriate--to protect against - * freeing by another task. It is the caller's responsibility to free the - * extra reference for shared policies. - */ -struct mempolicy *get_vma_policy(struct task_struct *task, - struct vm_area_struct *vma, unsigned long addr) +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) { - struct mempolicy *pol = get_task_policy(task); + struct mempolicy *pol = NULL; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { - struct mempolicy *vpol = vma->vm_ops->get_policy(vma, - addr); - if (vpol) - pol = vpol; + pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) { pol = vma->vm_policy; @@ -1618,11 +1555,55 @@ mpol_get(pol); } } + + return pol; +} + +/* + * get_vma_policy(@vma, @addr) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to current->mempolicy or system default policy, as necessary. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. + */ +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = __get_vma_policy(vma, addr); + if (!pol) - pol = &default_policy; + pol = get_task_policy(current); + return pol; } +bool vma_policy_mof(struct vm_area_struct *vma) +{ + struct mempolicy *pol; + + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } + + pol = vma->vm_policy; + if (!pol) + pol = get_task_policy(current); + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -1702,21 +1683,18 @@ /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. - * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. */ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); if (in_interrupt()) - return numa_node_id(); + return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1736,11 +1714,11 @@ struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: @@ -1755,7 +1733,7 @@ unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = -1; + int nid = NUMA_NO_NODE; if (!nnodes) return numa_node_id(); @@ -1792,11 +1770,11 @@ /* * Return the bit number of a random bit set in the nodemask. - * (returns -1 if nodemask is empty) + * (returns NUMA_NO_NODE if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = -1; + int w, bit = NUMA_NO_NODE; w = nodes_weight(*maskp); if (w) @@ -1808,18 +1786,18 @@ #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1827,7 +1805,7 @@ { struct zonelist *zl; - *mpol = get_vma_policy(current, vma, addr); + *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { @@ -1963,45 +1941,71 @@ * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must hold down_read on the mmap_sem of the * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with the mm_sem of the vma hold. + * all allocations for pages that will be mapped into user space. Returns + * NULL when no page can be allocated. */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; unsigned int cpuset_mems_cookie; + struct zonelist *zl; + nodemask_t *nmask; retry_cpuset: - pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; + goto out; + } - return page; + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + if (pol->mode == MPOL_PREFERRED && + !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); + page = __alloc_pages_node(hpage_node, + gfp | __GFP_THISNODE, order); + goto out; + } } - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, node), - policy_nodemask(gfp, pol)); - if (unlikely(mpol_needs_cond_ref(pol))) - __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, zl, nmask); + mpol_cond_put(pol); +out: + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2027,15 +2031,15 @@ */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = get_task_policy(current); + struct mempolicy *pol = &default_policy; struct page *page; unsigned int cpuset_mems_cookie; - if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) - pol = &default_policy; + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2048,13 +2052,23 @@ policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() @@ -2212,9 +2226,9 @@ /** * mpol_misplaced - check whether current page node is valid in policy * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. @@ -2232,12 +2246,14 @@ struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; BUG_ON(!vma); - pol = get_vma_policy(current, vma, addr); + pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; @@ -2280,33 +2296,9 @@ /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + polnid = thisnid; - polnid = numa_node_id(); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } @@ -2512,7 +2504,7 @@ } #ifdef CONFIG_NUMA_BALANCING -static bool __initdata numabalancing_override; +static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { @@ -2521,9 +2513,15 @@ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; - if (nr_node_ids > 1 && !numabalancing_override) { - printk(KERN_INFO "Enabling automatic NUMA balancing. " - "Configure with numa_balancing= or sysctl"); + /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ + if (numabalancing_override) + set_numabalancing_state(numabalancing_override == 1); + + if (num_online_nodes() > 1 && !numabalancing_override) { + pr_info("%s automatic NUMA balancing. " + "Configure with numa_balancing= or the " + "kernel.numa_balancing sysctl", + numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } @@ -2533,18 +2531,17 @@ int ret = 0; if (!str) goto out; - numabalancing_override = true; if (!strcmp(str, "enable")) { - set_numabalancing_state(true); + numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { - set_numabalancing_state(false); + numabalancing_override = -1; ret = 1; } out: if (!ret) - printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + pr_warn("Unable to parse numa_balancing=\n"); return ret; } @@ -2604,7 +2601,7 @@ node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } @@ -2772,62 +2769,45 @@ * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? pol->flags : 0; + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); - - if (!pol || pol == &default_policy || (pol->flags & MPOL_F_MORON)) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2838,11 +2818,7 @@ p += snprintf(p, buffer + maxlen - p, "relative"); } - if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; - p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); - } - return p - buffer; + if (!nodes_empty(nodes)) + p += scnprintf(p, buffer + maxlen - p, ":%*pbl", + nodemask_pr_args(&nodes)); }