path: root/mm/mempolicy.c
authorWe-unite <3205135446@qq.com>2025-03-08 22:04:20 +0800
committerWe-unite <3205135446@qq.com>2025-03-08 22:04:20 +0800
commita07bb8fd1299070229f0e8f3dcb57ffd5ef9870a (patch)
tree84f21bd0bf7071bc5fc7dd989e77d7ceb5476682 /mm/mempolicy.c
Initial commit: OpenHarmony-v4.0-Release
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  3050
1 file changed, 3050 insertions, 0 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
new file mode 100644
index 000000000..3f3677f96
--- /dev/null
+++ b/mm/mempolicy.c
@@ -0,0 +1,3050 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
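/*
 * A minimal userspace sketch of selecting the policies described above with
 * set_mempolicy(2); illustrative only, it assumes <numaif.h> from the numactl
 * package (link with -lnuma) and at least two online nodes.
 *
 *	#include <numaif.h>
 *
 *	static void pick_policies(void)
 *	{
 *		unsigned long nodes = 1UL << 0;		// nodemask with node 0 set
 *
 *		// preferred: try node 0 first, normal fallback otherwise
 *		set_mempolicy(MPOL_PREFERRED, &nodes, 8 * sizeof(nodes));
 *
 *		// interleave: spread allocations across nodes 0 and 1
 *		nodes = (1UL << 0) | (1UL << 1);
 *		set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *		// default: back to local allocation / no explicit policy
 *		set_mempolicy(MPOL_DEFAULT, NULL, 0);
 *	}
 */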
55
56/* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always graceful about that.
66*/
67
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70#include <linux/mempolicy.h>
71#include <linux/pagewalk.h>
72#include <linux/highmem.h>
73#include <linux/hugetlb.h>
74#include <linux/kernel.h>
75#include <linux/sched.h>
76#include <linux/sched/mm.h>
77#include <linux/sched/numa_balancing.h>
78#include <linux/sched/task.h>
79#include <linux/nodemask.h>
80#include <linux/cpuset.h>
81#include <linux/slab.h>
82#include <linux/string.h>
83#include <linux/export.h>
84#include <linux/nsproxy.h>
85#include <linux/interrupt.h>
86#include <linux/init.h>
87#include <linux/compat.h>
88#include <linux/ptrace.h>
89#include <linux/swap.h>
90#include <linux/seq_file.h>
91#include <linux/proc_fs.h>
92#include <linux/migrate.h>
93#include <linux/ksm.h>
94#include <linux/rmap.h>
95#include <linux/security.h>
96#include <linux/syscalls.h>
97#include <linux/ctype.h>
98#include <linux/mm_inline.h>
99#include <linux/mmu_notifier.h>
100#include <linux/printk.h>
101#include <linux/swapops.h>
102
103#include <asm/tlbflush.h>
104#include <linux/uaccess.h>
105
106#include "internal.h"
107
108/* Internal flags */
109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
111
112static struct kmem_cache *policy_cache;
113static struct kmem_cache *sn_cache;
114
115/* Highest zone. A specific allocation for a zone below that is not
116 policied. */
117enum zone_type policy_zone = 0;
118
119/*
120 * run-time system-wide default policy => local allocation
121 */
122static struct mempolicy default_policy = {
123 .refcnt = ATOMIC_INIT(1), /* never free it */
124 .mode = MPOL_PREFERRED,
125 .flags = MPOL_F_LOCAL,
126};
127
128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129
130/**
131 * numa_map_to_online_node - Find closest online node
132 * @node: Node id to start the search
133 *
134 * Look up the next closest node by distance if @node is not online.
135 */
136int numa_map_to_online_node(int node)
137{
138 int min_dist = INT_MAX, dist, n, min_node;
139
140 if (node == NUMA_NO_NODE || node_online(node))
141 return node;
142
143 min_node = node;
144 for_each_online_node(n) {
145 dist = node_distance(node, n);
146 if (dist < min_dist) {
147 min_dist = dist;
148 min_node = n;
149 }
150 }
151
152 return min_node;
153}
154EXPORT_SYMBOL_GPL(numa_map_to_online_node);
155
156struct mempolicy *get_task_policy(struct task_struct *p)
157{
158 struct mempolicy *pol = p->mempolicy;
159 int node;
160
161 if (pol)
162 return pol;
163
164 node = numa_node_id();
165 if (node != NUMA_NO_NODE) {
166 pol = &preferred_node_policy[node];
167 /* preferred_node_policy is not initialised early in boot */
168 if (pol->mode)
169 return pol;
170 }
171
172 return &default_policy;
173}
174
175static const struct mempolicy_operations {
176 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
177 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
178} mpol_ops[MPOL_MAX];
179
180static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
181{
182 return pol->flags & MPOL_MODE_FLAGS;
183}
184
185static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
186 const nodemask_t *rel)
187{
188 nodemask_t tmp;
189 nodes_fold(tmp, *orig, nodes_weight(*rel));
190 nodes_onto(*ret, tmp, *rel);
191}
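/*
 * A short worked example of the MPOL_F_RELATIVE_NODES remap done above, with
 * illustrative numbers: for a user nodemask of {0,2} and a relative mask of
 * {4,5,6} (weight 3), nodes_fold() wraps the user bits modulo 3, leaving
 * {0,2}, and nodes_onto() then maps them onto the 0th and 2nd set bits of the
 * relative mask, yielding {4,6}.
 */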
192
193static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
194{
195 if (nodes_empty(*nodes))
196 return -EINVAL;
197 pol->v.nodes = *nodes;
198 return 0;
199}
200
201static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
202{
203 if (!nodes)
204 pol->flags |= MPOL_F_LOCAL; /* local allocation */
205 else if (nodes_empty(*nodes))
206 return -EINVAL; /* no allowed nodes */
207 else
208 pol->v.preferred_node = first_node(*nodes);
209 return 0;
210}
211
212static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
213{
214 if (nodes_empty(*nodes))
215 return -EINVAL;
216 pol->v.nodes = *nodes;
217 return 0;
218}
219
220/*
221 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
222 * any, for the new policy. mpol_new() has already validated the nodes
223 * parameter with respect to the policy mode and flags. But, we need to
224 * handle an empty nodemask with MPOL_PREFERRED here.
225 *
226 * Must be called holding task's alloc_lock to protect task's mems_allowed
227 * and mempolicy. May also be called holding the mmap_lock for write.
228 */
229static int mpol_set_nodemask(struct mempolicy *pol,
230 const nodemask_t *nodes, struct nodemask_scratch *nsc)
231{
232 int ret;
233
234 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
235 if (pol == NULL)
236 return 0;
237 /* Check N_MEMORY */
238 nodes_and(nsc->mask1,
239 cpuset_current_mems_allowed, node_states[N_MEMORY]);
240
241 VM_BUG_ON(!nodes);
242 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
243 nodes = NULL; /* explicit local allocation */
244 else {
245 if (pol->flags & MPOL_F_RELATIVE_NODES)
246 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
247 else
248 nodes_and(nsc->mask2, *nodes, nsc->mask1);
249
250 if (mpol_store_user_nodemask(pol))
251 pol->w.user_nodemask = *nodes;
252 else
253 pol->w.cpuset_mems_allowed =
254 cpuset_current_mems_allowed;
255 }
256
257 if (nodes)
258 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
259 else
260 ret = mpol_ops[pol->mode].create(pol, NULL);
261 return ret;
262}
263
264/*
265 * This function just creates a new policy, does some checks and simple
266 * initialization. You must invoke mpol_set_nodemask() to set nodes.
267 */
268static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
269 nodemask_t *nodes)
270{
271 struct mempolicy *policy;
272
273 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
274 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
275
276 if (mode == MPOL_DEFAULT) {
277 if (nodes && !nodes_empty(*nodes))
278 return ERR_PTR(-EINVAL);
279 return NULL;
280 }
281 VM_BUG_ON(!nodes);
282
283 /*
284 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
285 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
286 * All other modes require a valid pointer to a non-empty nodemask.
287 */
288 if (mode == MPOL_PREFERRED) {
289 if (nodes_empty(*nodes)) {
290 if (((flags & MPOL_F_STATIC_NODES) ||
291 (flags & MPOL_F_RELATIVE_NODES)))
292 return ERR_PTR(-EINVAL);
293 }
294 } else if (mode == MPOL_LOCAL) {
295 if (!nodes_empty(*nodes) ||
296 (flags & MPOL_F_STATIC_NODES) ||
297 (flags & MPOL_F_RELATIVE_NODES))
298 return ERR_PTR(-EINVAL);
299 mode = MPOL_PREFERRED;
300 } else if (nodes_empty(*nodes))
301 return ERR_PTR(-EINVAL);
302 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
303 if (!policy)
304 return ERR_PTR(-ENOMEM);
305 atomic_set(&policy->refcnt, 1);
306 policy->mode = mode;
307 policy->flags = flags;
308
309 return policy;
310}
311
312/* Slow path of a mpol destructor. */
313void __mpol_put(struct mempolicy *p)
314{
315 if (!atomic_dec_and_test(&p->refcnt))
316 return;
317 kmem_cache_free(policy_cache, p);
318}
319
320static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
321{
322}
323
324static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
325{
326 nodemask_t tmp;
327
328 if (pol->flags & MPOL_F_STATIC_NODES)
329 nodes_and(tmp, pol->w.user_nodemask, *nodes);
330 else if (pol->flags & MPOL_F_RELATIVE_NODES)
331 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
332 else {
333 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
334 *nodes);
335 pol->w.cpuset_mems_allowed = *nodes;
336 }
337
338 if (nodes_empty(tmp))
339 tmp = *nodes;
340
341 pol->v.nodes = tmp;
342}
343
344static void mpol_rebind_preferred(struct mempolicy *pol,
345 const nodemask_t *nodes)
346{
347 nodemask_t tmp;
348
349 if (pol->flags & MPOL_F_STATIC_NODES) {
350 int node = first_node(pol->w.user_nodemask);
351
352 if (node_isset(node, *nodes)) {
353 pol->v.preferred_node = node;
354 pol->flags &= ~MPOL_F_LOCAL;
355 } else
356 pol->flags |= MPOL_F_LOCAL;
357 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
358 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
359 pol->v.preferred_node = first_node(tmp);
360 } else if (!(pol->flags & MPOL_F_LOCAL)) {
361 pol->v.preferred_node = node_remap(pol->v.preferred_node,
362 pol->w.cpuset_mems_allowed,
363 *nodes);
364 pol->w.cpuset_mems_allowed = *nodes;
365 }
366}
367
368/*
369 * mpol_rebind_policy - Migrate a policy to a different set of nodes
370 *
371 * Per-vma policies are protected by mmap_lock. Allocations using per-task
372 * policies are protected by task->mems_allowed_seq to prevent a premature
373 * OOM/allocation failure due to parallel nodemask modification.
374 */
375static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
376{
377 if (!pol || pol->mode == MPOL_LOCAL)
378 return;
379 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
380 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
381 return;
382
383 mpol_ops[pol->mode].rebind(pol, newmask);
384}
385
386/*
387 * Wrapper for mpol_rebind_policy() that just requires task
388 * pointer, and updates task mempolicy.
389 *
390 * Called with task's alloc_lock held.
391 */
392
393void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
394{
395 mpol_rebind_policy(tsk->mempolicy, new);
396}
397
398/*
399 * Rebind each vma in mm to new nodemask.
400 *
401 * Call holding a reference to mm. Takes mm->mmap_lock during call.
402 */
403
404void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
405{
406 struct vm_area_struct *vma;
407
408 mmap_write_lock(mm);
409 for (vma = mm->mmap; vma; vma = vma->vm_next)
410 mpol_rebind_policy(vma->vm_policy, new);
411 mmap_write_unlock(mm);
412}
413
414static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
415 [MPOL_DEFAULT] = {
416 .rebind = mpol_rebind_default,
417 },
418 [MPOL_INTERLEAVE] = {
419 .create = mpol_new_interleave,
420 .rebind = mpol_rebind_nodemask,
421 },
422 [MPOL_PREFERRED] = {
423 .create = mpol_new_preferred,
424 .rebind = mpol_rebind_preferred,
425 },
426 [MPOL_BIND] = {
427 .create = mpol_new_bind,
428 .rebind = mpol_rebind_nodemask,
429 },
430};
431
432static int migrate_page_add(struct page *page, struct list_head *pagelist,
433 unsigned long flags);
434
435struct queue_pages {
436 struct list_head *pagelist;
437 unsigned long flags;
438 nodemask_t *nmask;
439 unsigned long start;
440 unsigned long end;
441 struct vm_area_struct *first;
442};
443
444/*
445 * Check if the page's nid is in qp->nmask.
446 *
447 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
448 * not in qp->nmask instead.
449 */
450static inline bool queue_pages_required(struct page *page,
451 struct queue_pages *qp)
452{
453 int nid = page_to_nid(page);
454 unsigned long flags = qp->flags;
455
456 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
457}
458
459/*
460 * queue_pages_pmd() has four possible return values:
461 * 0 - pages are placed on the right node or queued successfully.
462 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
463 * specified.
464 * 2 - THP was split.
465 * -EIO - the PMD entry is a migration entry, or only MPOL_MF_STRICT was
466 * specified and an existing page was already on a node that does not
467 * follow the policy.
468 */
469static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
470 unsigned long end, struct mm_walk *walk)
471 __releases(ptl)
472{
473 int ret = 0;
474 struct page *page;
475 struct queue_pages *qp = walk->private;
476 unsigned long flags;
477
478 if (unlikely(is_pmd_migration_entry(*pmd))) {
479 ret = -EIO;
480 goto unlock;
481 }
482 page = pmd_page(*pmd);
483 if (is_huge_zero_page(page)) {
484 spin_unlock(ptl);
485 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
486 ret = 2;
487 goto out;
488 }
489 if (!queue_pages_required(page, qp))
490 goto unlock;
491
492 flags = qp->flags;
493 /* go to thp migration */
494 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
495 if (!vma_migratable(walk->vma) ||
496 migrate_page_add(page, qp->pagelist, flags)) {
497 ret = 1;
498 goto unlock;
499 }
500 } else
501 ret = -EIO;
502unlock:
503 spin_unlock(ptl);
504out:
505 return ret;
506}
507
508/*
509 * Scan through pages checking if pages follow certain conditions,
510 * and move them to the pagelist if they do.
511 *
512 * queue_pages_pte_range() has three possible return values:
513 * 0 - pages are placed on the right node or queued successfully.
514 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
515 * specified.
516 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
517 * on a node that does not follow the policy.
518 */
519static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
520 unsigned long end, struct mm_walk *walk)
521{
522 struct vm_area_struct *vma = walk->vma;
523 struct page *page;
524 struct queue_pages *qp = walk->private;
525 unsigned long flags = qp->flags;
526 int ret;
527 bool has_unmovable = false;
528 pte_t *pte, *mapped_pte;
529 spinlock_t *ptl;
530
531 ptl = pmd_trans_huge_lock(pmd, vma);
532 if (ptl) {
533 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
534 if (ret != 2)
535 return ret;
536 }
537 /* THP was split, fall through to pte walk */
538
539 if (pmd_trans_unstable(pmd))
540 return 0;
541
542 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
543 for (; addr != end; pte++, addr += PAGE_SIZE) {
544 if (!pte_present(*pte))
545 continue;
546 page = vm_normal_page(vma, addr, *pte);
547 if (!page)
548 continue;
549 /*
550 * vm_normal_page() filters out zero pages, but there might
551 * still be PageReserved pages to skip, perhaps in a VDSO.
552 */
553 if (PageReserved(page))
554 continue;
555 if (!queue_pages_required(page, qp))
556 continue;
557 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
558 /* MPOL_MF_STRICT must be specified if we get here */
559 if (!vma_migratable(vma)) {
560 has_unmovable = true;
561 break;
562 }
563
564 /*
565 * Do not abort immediately since there may be
566 * temporarily off-LRU pages in the range. We still
567 * need to migrate the other LRU pages.
568 */
569 if (migrate_page_add(page, qp->pagelist, flags))
570 has_unmovable = true;
571 } else
572 break;
573 }
574 pte_unmap_unlock(mapped_pte, ptl);
575 cond_resched();
576
577 if (has_unmovable)
578 return 1;
579
580 return addr != end ? -EIO : 0;
581}
582
583static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
584 unsigned long addr, unsigned long end,
585 struct mm_walk *walk)
586{
587 int ret = 0;
588#ifdef CONFIG_HUGETLB_PAGE
589 struct queue_pages *qp = walk->private;
590 unsigned long flags = (qp->flags & MPOL_MF_VALID);
591 struct page *page;
592 spinlock_t *ptl;
593 pte_t entry;
594
595 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596 entry = huge_ptep_get(pte);
597 if (!pte_present(entry))
598 goto unlock;
599 page = pte_page(entry);
600 if (!queue_pages_required(page, qp))
601 goto unlock;
602
603 if (flags == MPOL_MF_STRICT) {
604 /*
605 * STRICT alone means only detecting misplaced pages and there is no
606 * need to check other vmas further.
607 */
608 ret = -EIO;
609 goto unlock;
610 }
611
612 if (!vma_migratable(walk->vma)) {
613 /*
614 * Must be STRICT with MOVE*, otherwise .test_walk() would have
615 * stopped walking the current vma.
616 * Detect the misplaced page, but allow migrating pages which
617 * have been queued.
618 */
619 ret = 1;
620 goto unlock;
621 }
622
623 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
624 if (flags & (MPOL_MF_MOVE_ALL) ||
625 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
626 if (!isolate_huge_page(page, qp->pagelist) &&
627 (flags & MPOL_MF_STRICT))
628 /*
629 * Failed to isolate page but allow migrating pages
630 * which have been queued.
631 */
632 ret = 1;
633 }
634unlock:
635 spin_unlock(ptl);
636#else
637 BUG();
638#endif
639 return ret;
640}
641
642#ifdef CONFIG_NUMA_BALANCING
643/*
644 * This is used to mark a range of virtual addresses to be inaccessible.
645 * These are later cleared by a NUMA hinting fault. Depending on these
646 * faults, pages may be migrated for better NUMA placement.
647 *
648 * This is assuming that NUMA faults are handled using PROT_NONE. If
649 * an architecture makes a different choice, it will need further
650 * changes to the core.
651 */
652unsigned long change_prot_numa(struct vm_area_struct *vma,
653 unsigned long addr, unsigned long end)
654{
655 int nr_updated;
656
657 nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
658 if (nr_updated)
659 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
660
661 return nr_updated;
662}
663#else
664static unsigned long change_prot_numa(struct vm_area_struct *vma,
665 unsigned long addr, unsigned long end)
666{
667 return 0;
668}
669#endif /* CONFIG_NUMA_BALANCING */
670
671static int queue_pages_test_walk(unsigned long start, unsigned long end,
672 struct mm_walk *walk)
673{
674 struct vm_area_struct *vma = walk->vma;
675 struct queue_pages *qp = walk->private;
676 unsigned long endvma = vma->vm_end;
677 unsigned long flags = qp->flags;
678
679 /* range check first */
680 VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
681
682 if (!qp->first) {
683 qp->first = vma;
684 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
685 (qp->start < vma->vm_start))
686 /* hole at head side of range */
687 return -EFAULT;
688 }
689 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
690 ((vma->vm_end < qp->end) &&
691 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
692 /* hole at middle or tail of range */
693 return -EFAULT;
694
695 /*
696 * We need to check MPOL_MF_STRICT to return -EIO if possible,
697 * regardless of vma_migratable.
698 */
699 if (!vma_migratable(vma) &&
700 !(flags & MPOL_MF_STRICT))
701 return 1;
702
703 if (endvma > end)
704 endvma = end;
705
706 if (flags & MPOL_MF_LAZY) {
707 /* Similar to task_numa_work, skip inaccessible VMAs */
708 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
709 !(vma->vm_flags & VM_MIXEDMAP))
710 change_prot_numa(vma, start, endvma);
711 return 1;
712 }
713
714 /* queue pages from current vma */
715 if (flags & MPOL_MF_VALID)
716 return 0;
717 return 1;
718}
719
720static const struct mm_walk_ops queue_pages_walk_ops = {
721 .hugetlb_entry = queue_pages_hugetlb,
722 .pmd_entry = queue_pages_pte_range,
723 .test_walk = queue_pages_test_walk,
724};
725
726/*
727 * Walk through page tables and collect pages to be migrated.
728 *
729 * If pages found in a given range are on a set of nodes (determined by
730 * @nodes and @flags), they are isolated and queued to the pagelist, which
731 * is passed via @private.
732 *
733 * queue_pages_range() has three possible return values:
734 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
735 * specified.
736 * 0 - pages queued successfully or no misplaced page found.
737 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or the
738 * memory range specified by nodemask and maxnode points outside
739 * the accessible address space (-EFAULT)
740 */
741static int
742queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
743 nodemask_t *nodes, unsigned long flags,
744 struct list_head *pagelist)
745{
746 int err;
747 struct queue_pages qp = {
748 .pagelist = pagelist,
749 .flags = flags,
750 .nmask = nodes,
751 .start = start,
752 .end = end,
753 .first = NULL,
754 };
755
756 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
757
758 if (!qp.first)
759 /* whole range in hole */
760 err = -EFAULT;
761
762 return err;
763}
764
765/*
766 * Apply policy to a single VMA
767 * This must be called with the mmap_lock held for writing.
768 */
769static int vma_replace_policy(struct vm_area_struct *vma,
770 struct mempolicy *pol)
771{
772 int err;
773 struct mempolicy *old;
774 struct mempolicy *new;
775
776 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
777 vma->vm_start, vma->vm_end, vma->vm_pgoff,
778 vma->vm_ops, vma->vm_file,
779 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
780
781 new = mpol_dup(pol);
782 if (IS_ERR(new))
783 return PTR_ERR(new);
784
785 if (vma->vm_ops && vma->vm_ops->set_policy) {
786 err = vma->vm_ops->set_policy(vma, new);
787 if (err)
788 goto err_out;
789 }
790
791 old = vma->vm_policy;
792 vma->vm_policy = new; /* protected by mmap_lock */
793 mpol_put(old);
794
795 return 0;
796 err_out:
797 mpol_put(new);
798 return err;
799}
800
801/* Step 2: apply policy to a range and do splits. */
802static int mbind_range(struct mm_struct *mm, unsigned long start,
803 unsigned long end, struct mempolicy *new_pol)
804{
805 struct vm_area_struct *prev;
806 struct vm_area_struct *vma;
807 int err = 0;
808 pgoff_t pgoff;
809 unsigned long vmstart;
810 unsigned long vmend;
811
812 vma = find_vma(mm, start);
813 VM_BUG_ON(!vma);
814
815 prev = vma->vm_prev;
816 if (start > vma->vm_start)
817 prev = vma;
818
819 for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
820 vmstart = max(start, vma->vm_start);
821 vmend = min(end, vma->vm_end);
822
823 if (mpol_equal(vma_policy(vma), new_pol))
824 continue;
825
826 pgoff = vma->vm_pgoff +
827 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
828 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
829 vma->anon_vma, vma->vm_file, pgoff,
830 new_pol, vma->vm_userfaultfd_ctx,
831 anon_vma_name(vma));
832 if (prev) {
833 vma = prev;
834 goto replace;
835 }
836 if (vma->vm_start != vmstart) {
837 err = split_vma(vma->vm_mm, vma, vmstart, 1);
838 if (err)
839 goto out;
840 }
841 if (vma->vm_end != vmend) {
842 err = split_vma(vma->vm_mm, vma, vmend, 0);
843 if (err)
844 goto out;
845 }
846 replace:
847 err = vma_replace_policy(vma, new_pol);
848 if (err)
849 goto out;
850 }
851
852 out:
853 return err;
854}
855
856/* Set the process memory policy */
857static long do_set_mempolicy(unsigned short mode, unsigned short flags,
858 nodemask_t *nodes)
859{
860 struct mempolicy *new, *old;
861 NODEMASK_SCRATCH(scratch);
862 int ret;
863
864 if (!scratch)
865 return -ENOMEM;
866
867 new = mpol_new(mode, flags, nodes);
868 if (IS_ERR(new)) {
869 ret = PTR_ERR(new);
870 goto out;
871 }
872
873 ret = mpol_set_nodemask(new, nodes, scratch);
874 if (ret) {
875 mpol_put(new);
876 goto out;
877 }
878 task_lock(current);
879 old = current->mempolicy;
880 current->mempolicy = new;
881 if (new && new->mode == MPOL_INTERLEAVE)
882 current->il_prev = MAX_NUMNODES-1;
883 task_unlock(current);
884 mpol_put(old);
885 ret = 0;
886out:
887 NODEMASK_SCRATCH_FREE(scratch);
888 return ret;
889}
890
891/*
892 * Return nodemask for policy for get_mempolicy() query
893 *
894 * Called with task's alloc_lock held
895 */
896static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
897{
898 nodes_clear(*nodes);
899 if (p == &default_policy)
900 return;
901
902 switch (p->mode) {
903 case MPOL_BIND:
904 case MPOL_INTERLEAVE:
905 *nodes = p->v.nodes;
906 break;
907 case MPOL_PREFERRED:
908 if (!(p->flags & MPOL_F_LOCAL))
909 node_set(p->v.preferred_node, *nodes);
910 /* else return empty node mask for local allocation */
911 break;
912 default:
913 BUG();
914 }
915}
916
917static int lookup_node(struct mm_struct *mm, unsigned long addr)
918{
919 struct page *p = NULL;
920 int err;
921
922 int locked = 1;
923 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
924 if (err > 0) {
925 err = page_to_nid(p);
926 put_page(p);
927 }
928 if (locked)
929 mmap_read_unlock(mm);
930 return err;
931}
932
933/* Retrieve NUMA policy */
934static long do_get_mempolicy(int *policy, nodemask_t *nmask,
935 unsigned long addr, unsigned long flags)
936{
937 int err;
938 struct mm_struct *mm = current->mm;
939 struct vm_area_struct *vma = NULL;
940 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
941
942 if (flags &
943 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
944 return -EINVAL;
945
946 if (flags & MPOL_F_MEMS_ALLOWED) {
947 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
948 return -EINVAL;
949 *policy = 0; /* just so it's initialized */
950 task_lock(current);
951 *nmask = cpuset_current_mems_allowed;
952 task_unlock(current);
953 return 0;
954 }
955
956 if (flags & MPOL_F_ADDR) {
957 /*
958 * Do NOT fall back to task policy if the
959 * vma/shared policy at addr is NULL. We
960 * want to return MPOL_DEFAULT in this case.
961 */
962 mmap_read_lock(mm);
963 vma = find_vma_intersection(mm, addr, addr+1);
964 if (!vma) {
965 mmap_read_unlock(mm);
966 return -EFAULT;
967 }
968 if (vma->vm_ops && vma->vm_ops->get_policy)
969 pol = vma->vm_ops->get_policy(vma, addr);
970 else
971 pol = vma->vm_policy;
972 } else if (addr)
973 return -EINVAL;
974
975 if (!pol)
976 pol = &default_policy; /* indicates default behavior */
977
978 if (flags & MPOL_F_NODE) {
979 if (flags & MPOL_F_ADDR) {
980 /*
981 * Take a refcount on the mpol, lookup_node()
982 * will drop the mmap_lock, so after calling
983 * lookup_node() only "pol" remains valid, "vma"
984 * is stale.
985 */
986 pol_refcount = pol;
987 vma = NULL;
988 mpol_get(pol);
989 err = lookup_node(mm, addr);
990 if (err < 0)
991 goto out;
992 *policy = err;
993 } else if (pol == current->mempolicy &&
994 pol->mode == MPOL_INTERLEAVE) {
995 *policy = next_node_in(current->il_prev, pol->v.nodes);
996 } else {
997 err = -EINVAL;
998 goto out;
999 }
1000 } else {
1001 *policy = pol == &default_policy ? MPOL_DEFAULT :
1002 pol->mode;
1003 /*
1004 * Internal mempolicy flags must be masked off before exposing
1005 * the policy to userspace.
1006 */
1007 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1008 }
1009
1010 err = 0;
1011 if (nmask) {
1012 if (mpol_store_user_nodemask(pol)) {
1013 *nmask = pol->w.user_nodemask;
1014 } else {
1015 task_lock(current);
1016 get_policy_nodemask(pol, nmask);
1017 task_unlock(current);
1018 }
1019 }
1020
1021 out:
1022 mpol_cond_put(pol);
1023 if (vma)
1024 mmap_read_unlock(mm);
1025 if (pol_refcount)
1026 mpol_put(pol_refcount);
1027 return err;
1028}
1029
1030#ifdef CONFIG_MIGRATION
1031/*
1032 * page migration, thp tail pages can be passed.
1033 */
1034static int migrate_page_add(struct page *page, struct list_head *pagelist,
1035 unsigned long flags)
1036{
1037 struct page *head = compound_head(page);
1038 /*
1039 * Avoid migrating a page that is shared with others.
1040 */
1041 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1042 if (!isolate_lru_page(head)) {
1043 list_add_tail(&head->lru, pagelist);
1044 mod_node_page_state(page_pgdat(head),
1045 NR_ISOLATED_ANON + page_is_file_lru(head),
1046 thp_nr_pages(head));
1047 } else if (flags & MPOL_MF_STRICT) {
1048 /*
1049 * Non-movable page may reach here. And, there may be
1050 * temporary off LRU pages or non-LRU movable pages.
1051 * Treat them as unmovable pages since they can't be
1052 * isolated, so they can't be moved at the moment. It
1053 * should return -EIO for this case too.
1054 */
1055 return -EIO;
1056 }
1057 }
1058
1059 return 0;
1060}
1061
1062/*
1063 * Migrate pages from one node to a target node.
1064 * Returns error or the number of pages not migrated.
1065 */
1066static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1067 int flags)
1068{
1069 nodemask_t nmask;
1070 LIST_HEAD(pagelist);
1071 int err = 0;
1072 struct migration_target_control mtc = {
1073 .nid = dest,
1074 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1075 };
1076
1077 nodes_clear(nmask);
1078 node_set(source, nmask);
1079
1080 /*
1081 * This does not "check" the range but isolates all pages that
1082 * need migration. Between passing in the full user address
1083 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1084 */
1085 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1086 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1087 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1088
1089 if (!list_empty(&pagelist)) {
1090 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1091 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1092 if (err)
1093 putback_movable_pages(&pagelist);
1094 }
1095
1096 return err;
1097}
1098
1099/*
1100 * Move pages between the two nodesets so as to preserve the physical
1101 * layout as much as possible.
1102 *
1103 * Returns the number of pages that could not be moved.
1104 */
1105int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1106 const nodemask_t *to, int flags)
1107{
1108 int busy = 0;
1109 int err;
1110 nodemask_t tmp;
1111
1112 err = migrate_prep();
1113 if (err)
1114 return err;
1115
1116 mmap_read_lock(mm);
1117
1118 /*
1119 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1120 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1121 * bit in 'tmp', and return that <source, dest> pair for migration.
1122 * The pair of nodemasks 'to' and 'from' define the map.
1123 *
1124 * If no pair of bits is found that way, fallback to picking some
1125 * pair of 'source' and 'dest' bits that are not the same. If the
1126 * 'source' and 'dest' bits are the same, this represents a node
1127 * that will be migrating to itself, so no pages need move.
1128 *
1129 * If no bits are left in 'tmp', or if all remaining bits left
1130 * in 'tmp' correspond to the same bit in 'to', return false
1131 * (nothing left to migrate).
1132 *
1133 * This lets us pick a pair of nodes to migrate between, such that
1134 * if possible the dest node is not already occupied by some other
1135 * source node, minimizing the risk of overloading the memory on a
1136 * node that would happen if we migrated incoming memory to a node
1137 * before migrating the outgoing memory from that same node.
1138 *
1139 * A single scan of tmp is sufficient. As we go, we remember the
1140 * most recent <s, d> pair that moved (s != d). If we find a pair
1141 * that not only moved, but what's better, moved to an empty slot
1142 * (d is not set in tmp), then we break out then, with that pair.
1143 * Otherwise, when we finish scanning tmp, we at least have the
1144 * most recent <s, d> pair that moved. If we get all the way through
1145 * the scan of tmp without finding any node that moved, much less
1146 * moved to an empty node, then there is nothing left worth migrating.
1147 */
1148
1149 tmp = *from;
1150 while (!nodes_empty(tmp)) {
1151 int s,d;
1152 int source = NUMA_NO_NODE;
1153 int dest = 0;
1154
1155 for_each_node_mask(s, tmp) {
1156
1157 /*
1158 * do_migrate_pages() tries to maintain the relative
1159 * node relationship of the pages established between
1160 * threads and memory areas.
1161 *
1162 * However if the number of source nodes is not equal to
1163 * the number of destination nodes we can not preserve
1164 * this node relative relationship. In that case, skip
1165 * copying memory from a node that is in the destination
1166 * mask.
1167 *
1168 * Example: [2,3,4] -> [3,4,5] moves everything.
1169 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1170 */
1171
1172 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1173 (node_isset(s, *to)))
1174 continue;
1175
1176 d = node_remap(s, *from, *to);
1177 if (s == d)
1178 continue;
1179
1180 source = s; /* Node moved. Memorize */
1181 dest = d;
1182
1183 /* dest not in remaining from nodes? */
1184 if (!node_isset(dest, tmp))
1185 break;
1186 }
1187 if (source == NUMA_NO_NODE)
1188 break;
1189
1190 node_clear(source, tmp);
1191 err = migrate_to_node(mm, source, dest, flags);
1192 if (err > 0)
1193 busy += err;
1194 if (err < 0)
1195 break;
1196 }
1197 mmap_read_unlock(mm);
1198 if (err < 0)
1199 return err;
1200 return busy;
1201
1202}
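/*
 * A worked pass of the pairing loop above, reusing the [2,3,4] -> [3,4,5]
 * example from its comment: the first scan settles on <4,5> because dest 5 is
 * outside the remaining source set, the next scan picks <3,4>, and the last
 * picks <2,3>.  Migrating in that order (4->5, then 3->4, then 2->3) moves
 * pages off a node before other pages are migrated onto it, which is how the
 * relative layout is preserved.
 */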
1203
1204/*
1205 * Allocate a new page for page migration based on vma policy.
1206 * Start by assuming the page is mapped by the same vma as contains @start.
1207 * Search forward from there, if not. N.B., this assumes that the
1208 * list of pages handed to migrate_pages()--which is how we get here--
1209 * is in virtual address order.
1210 */
1211static struct page *new_page(struct page *page, unsigned long start)
1212{
1213 struct vm_area_struct *vma;
1214 unsigned long address;
1215
1216 vma = find_vma(current->mm, start);
1217 while (vma) {
1218 address = page_address_in_vma(page, vma);
1219 if (address != -EFAULT)
1220 break;
1221 vma = vma->vm_next;
1222 }
1223
1224 if (PageHuge(page)) {
1225 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1226 vma, address);
1227 } else if (PageTransHuge(page)) {
1228 struct page *thp;
1229
1230 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1231 HPAGE_PMD_ORDER);
1232 if (!thp)
1233 return NULL;
1234 prep_transhuge_page(thp);
1235 return thp;
1236 }
1237 /*
1238 * if !vma, alloc_page_vma() will use task or system default policy
1239 */
1240 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1241 vma, address);
1242}
1243#else
1244
1245static int migrate_page_add(struct page *page, struct list_head *pagelist,
1246 unsigned long flags)
1247{
1248 return -EIO;
1249}
1250
1251int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1252 const nodemask_t *to, int flags)
1253{
1254 return -ENOSYS;
1255}
1256
1257static struct page *new_page(struct page *page, unsigned long start)
1258{
1259 return NULL;
1260}
1261#endif
1262
1263static long do_mbind(unsigned long start, unsigned long len,
1264 unsigned short mode, unsigned short mode_flags,
1265 nodemask_t *nmask, unsigned long flags)
1266{
1267 struct mm_struct *mm = current->mm;
1268 struct mempolicy *new;
1269 unsigned long end;
1270 int err;
1271 int ret;
1272 LIST_HEAD(pagelist);
1273
1274 if (flags & ~(unsigned long)MPOL_MF_VALID)
1275 return -EINVAL;
1276 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1277 return -EPERM;
1278
1279 if (start & ~PAGE_MASK)
1280 return -EINVAL;
1281
1282 if (mode == MPOL_DEFAULT)
1283 flags &= ~MPOL_MF_STRICT;
1284
1285 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1286 end = start + len;
1287
1288 if (end < start)
1289 return -EINVAL;
1290 if (end == start)
1291 return 0;
1292
1293 new = mpol_new(mode, mode_flags, nmask);
1294 if (IS_ERR(new))
1295 return PTR_ERR(new);
1296
1297 if (flags & MPOL_MF_LAZY)
1298 new->flags |= MPOL_F_MOF;
1299
1300 /*
1301 * If we are using the default policy then operation
1302 * on discontinuous address spaces is okay after all
1303 */
1304 if (!new)
1305 flags |= MPOL_MF_DISCONTIG_OK;
1306
1307 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1308 start, start + len, mode, mode_flags,
1309 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1310
1311 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1312
1313 err = migrate_prep();
1314 if (err)
1315 goto mpol_out;
1316 }
1317 {
1318 NODEMASK_SCRATCH(scratch);
1319 if (scratch) {
1320 mmap_write_lock(mm);
1321 err = mpol_set_nodemask(new, nmask, scratch);
1322 if (err)
1323 mmap_write_unlock(mm);
1324 } else
1325 err = -ENOMEM;
1326 NODEMASK_SCRATCH_FREE(scratch);
1327 }
1328 if (err)
1329 goto mpol_out;
1330
1331 ret = queue_pages_range(mm, start, end, nmask,
1332 flags | MPOL_MF_INVERT, &pagelist);
1333
1334 if (ret < 0) {
1335 err = ret;
1336 goto up_out;
1337 }
1338
1339 err = mbind_range(mm, start, end, new);
1340
1341 if (!err) {
1342 int nr_failed = 0;
1343
1344 if (!list_empty(&pagelist)) {
1345 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1346 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1347 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1348 if (nr_failed)
1349 putback_movable_pages(&pagelist);
1350 }
1351
1352 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1353 err = -EIO;
1354 } else {
1355up_out:
1356 if (!list_empty(&pagelist))
1357 putback_movable_pages(&pagelist);
1358 }
1359
1360 mmap_write_unlock(mm);
1361mpol_out:
1362 mpol_put(new);
1363 return err;
1364}
1365
1366/*
1367 * User space interface with variable sized bitmaps for nodelists.
1368 */
1369
1370/* Copy a node mask from user space. */
1371static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1372 unsigned long maxnode)
1373{
1374 unsigned long k;
1375 unsigned long t;
1376 unsigned long nlongs;
1377 unsigned long endmask;
1378
1379 --maxnode;
1380 nodes_clear(*nodes);
1381 if (maxnode == 0 || !nmask)
1382 return 0;
1383 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1384 return -EINVAL;
1385
1386 nlongs = BITS_TO_LONGS(maxnode);
1387 if ((maxnode % BITS_PER_LONG) == 0)
1388 endmask = ~0UL;
1389 else
1390 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1391
1392 /*
1393 * When the user specified more nodes than supported, just check
1394 * that the unsupported part is all zero.
1395 *
1396 * If maxnode has more longs than MAX_NUMNODES, check
1397 * the bits in that area first, and then go on to
1398 * check the remaining bits, which are at or above MAX_NUMNODES.
1399 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1400 */
1401 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1402 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1403 if (get_user(t, nmask + k))
1404 return -EFAULT;
1405 if (k == nlongs - 1) {
1406 if (t & endmask)
1407 return -EINVAL;
1408 } else if (t)
1409 return -EINVAL;
1410 }
1411 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1412 endmask = ~0UL;
1413 }
1414
1415 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1416 unsigned long valid_mask = endmask;
1417
1418 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1419 if (get_user(t, nmask + nlongs - 1))
1420 return -EFAULT;
1421 if (t & valid_mask)
1422 return -EINVAL;
1423 }
1424
1425 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1426 return -EFAULT;
1427 nodes_addr(*nodes)[nlongs-1] &= endmask;
1428 return 0;
1429}
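/*
 * Worked example of the checks above, assuming 64-bit longs and
 * MAX_NUMNODES = 1024 purely for illustration: a caller passing
 * maxnode = 2048 yields nlongs = 32, so longs 16..31 of the user mask must be
 * all zero; nlongs is then clamped to BITS_TO_LONGS(1024) = 16 and only those
 * first sixteen longs are copied into *nodes.
 */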
1430
1431/* Copy a kernel node mask to user space */
1432static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1433 nodemask_t *nodes)
1434{
1435 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1436 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1437
1438 if (copy > nbytes) {
1439 if (copy > PAGE_SIZE)
1440 return -EINVAL;
1441 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1442 return -EFAULT;
1443 copy = nbytes;
1444 }
1445 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1446}
1447
1448static long kernel_mbind(unsigned long start, unsigned long len,
1449 unsigned long mode, const unsigned long __user *nmask,
1450 unsigned long maxnode, unsigned int flags)
1451{
1452 nodemask_t nodes;
1453 int err;
1454 unsigned short mode_flags;
1455
1456 start = untagged_addr(start);
1457 mode_flags = mode & MPOL_MODE_FLAGS;
1458 mode &= ~MPOL_MODE_FLAGS;
1459 if (mode >= MPOL_MAX)
1460 return -EINVAL;
1461 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1462 (mode_flags & MPOL_F_RELATIVE_NODES))
1463 return -EINVAL;
1464 err = get_nodes(&nodes, nmask, maxnode);
1465 if (err)
1466 return err;
1467 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1468}
1469
1470SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1471 unsigned long, mode, const unsigned long __user *, nmask,
1472 unsigned long, maxnode, unsigned int, flags)
1473{
1474 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1475}
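/*
 * A minimal userspace sketch of the mbind(2) entry point above; illustrative
 * only, assuming <numaif.h> from numactl (-lnuma).  It binds an anonymous
 * mapping to node 0 and asks for any existing pages to be migrated there:
 *
 *	#include <sys/mman.h>
 *	#include <numaif.h>
 *
 *	static void *map_bound_to_node0(size_t len)
 *	{
 *		unsigned long nodes = 1UL << 0;		// nodemask: node 0 only
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (p != MAP_FAILED)
 *			mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes),
 *			      MPOL_MF_MOVE | MPOL_MF_STRICT);
 *		return p;
 *	}
 */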
1476
1477/* Set the process memory policy */
1478static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1479 unsigned long maxnode)
1480{
1481 int err;
1482 nodemask_t nodes;
1483 unsigned short flags;
1484
1485 flags = mode & MPOL_MODE_FLAGS;
1486 mode &= ~MPOL_MODE_FLAGS;
1487 if ((unsigned int)mode >= MPOL_MAX)
1488 return -EINVAL;
1489 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1490 return -EINVAL;
1491 err = get_nodes(&nodes, nmask, maxnode);
1492 if (err)
1493 return err;
1494 return do_set_mempolicy(mode, flags, &nodes);
1495}
1496
1497SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1498 unsigned long, maxnode)
1499{
1500 return kernel_set_mempolicy(mode, nmask, maxnode);
1501}
1502
1503static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1504 const unsigned long __user *old_nodes,
1505 const unsigned long __user *new_nodes)
1506{
1507 struct mm_struct *mm = NULL;
1508 struct task_struct *task;
1509 nodemask_t task_nodes;
1510 int err;
1511 nodemask_t *old;
1512 nodemask_t *new;
1513 NODEMASK_SCRATCH(scratch);
1514
1515 if (!scratch)
1516 return -ENOMEM;
1517
1518 old = &scratch->mask1;
1519 new = &scratch->mask2;
1520
1521 err = get_nodes(old, old_nodes, maxnode);
1522 if (err)
1523 goto out;
1524
1525 err = get_nodes(new, new_nodes, maxnode);
1526 if (err)
1527 goto out;
1528
1529 /* Find the mm_struct */
1530 rcu_read_lock();
1531 task = pid ? find_task_by_vpid(pid) : current;
1532 if (!task) {
1533 rcu_read_unlock();
1534 err = -ESRCH;
1535 goto out;
1536 }
1537 get_task_struct(task);
1538
1539 err = -EINVAL;
1540
1541 /*
1542 * Check if this process has the right to modify the specified process.
1543 * Use the regular "ptrace_may_access()" checks.
1544 */
1545 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1546 rcu_read_unlock();
1547 err = -EPERM;
1548 goto out_put;
1549 }
1550 rcu_read_unlock();
1551
1552 task_nodes = cpuset_mems_allowed(task);
1553 /* Is the user allowed to access the target nodes? */
1554 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1555 err = -EPERM;
1556 goto out_put;
1557 }
1558
1559 task_nodes = cpuset_mems_allowed(current);
1560 nodes_and(*new, *new, task_nodes);
1561 if (nodes_empty(*new))
1562 goto out_put;
1563
1564 err = security_task_movememory(task);
1565 if (err)
1566 goto out_put;
1567
1568 mm = get_task_mm(task);
1569 put_task_struct(task);
1570
1571 if (!mm) {
1572 err = -EINVAL;
1573 goto out;
1574 }
1575
1576 err = do_migrate_pages(mm, old, new,
1577 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1578
1579 mmput(mm);
1580out:
1581 NODEMASK_SCRATCH_FREE(scratch);
1582
1583 return err;
1584
1585out_put:
1586 put_task_struct(task);
1587 goto out;
1588
1589}
1590
1591SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1592 const unsigned long __user *, old_nodes,
1593 const unsigned long __user *, new_nodes)
1594{
1595 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1596}
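/*
 * Illustrative userspace sketch of the migrate_pages(2) entry point above,
 * assuming <numaif.h> from numactl (-lnuma): move the calling process's pages
 * from node 0 to node 1.  The call returns the number of pages that could not
 * be moved, or a negative errno.
 *
 *	#include <numaif.h>
 *
 *	static long move_self_node0_to_node1(void)
 *	{
 *		unsigned long old_nodes = 1UL << 0;	// source: node 0
 *		unsigned long new_nodes = 1UL << 1;	// destination: node 1
 *
 *		// pid 0 means the calling process
 *		return migrate_pages(0, 8 * sizeof(old_nodes),
 *				     &old_nodes, &new_nodes);
 *	}
 */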
1597
1598
1599/* Retrieve NUMA policy */
1600static int kernel_get_mempolicy(int __user *policy,
1601 unsigned long __user *nmask,
1602 unsigned long maxnode,
1603 unsigned long addr,
1604 unsigned long flags)
1605{
1606 int err;
1607 int pval;
1608 nodemask_t nodes;
1609
1610 if (nmask != NULL && maxnode < nr_node_ids)
1611 return -EINVAL;
1612
1613 addr = untagged_addr(addr);
1614
1615 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1616
1617 if (err)
1618 return err;
1619
1620 if (policy && put_user(pval, policy))
1621 return -EFAULT;
1622
1623 if (nmask)
1624 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1625
1626 return err;
1627}
1628
1629SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1630 unsigned long __user *, nmask, unsigned long, maxnode,
1631 unsigned long, addr, unsigned long, flags)
1632{
1633 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1634}
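/*
 * Illustrative userspace sketch of the get_mempolicy(2) entry point above,
 * assuming <numaif.h> from numactl (-lnuma): with MPOL_F_NODE | MPOL_F_ADDR
 * the "policy" output is the node id backing the page at @addr, as handled by
 * do_get_mempolicy() and lookup_node().
 *
 *	#include <numaif.h>
 *
 *	static int node_of_address(void *addr)
 *	{
 *		int node = -1;
 *
 *		if (get_mempolicy(&node, NULL, 0, addr,
 *				  MPOL_F_NODE | MPOL_F_ADDR) < 0)
 *			return -1;		// invalid address or other error
 *		return node;
 *	}
 */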
1635
1636#ifdef CONFIG_COMPAT
1637
1638COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1639 compat_ulong_t __user *, nmask,
1640 compat_ulong_t, maxnode,
1641 compat_ulong_t, addr, compat_ulong_t, flags)
1642{
1643 long err;
1644 unsigned long __user *nm = NULL;
1645 unsigned long nr_bits, alloc_size;
1646 DECLARE_BITMAP(bm, MAX_NUMNODES);
1647
1648 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1649 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1650
1651 if (nmask)
1652 nm = compat_alloc_user_space(alloc_size);
1653
1654 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1655
1656 if (!err && nmask) {
1657 unsigned long copy_size;
1658 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1659 err = copy_from_user(bm, nm, copy_size);
1660 /* ensure entire bitmap is zeroed */
1661 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1662 err |= compat_put_bitmap(nmask, bm, nr_bits);
1663 }
1664
1665 return err;
1666}
1667
1668COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1669 compat_ulong_t, maxnode)
1670{
1671 unsigned long __user *nm = NULL;
1672 unsigned long nr_bits, alloc_size;
1673 DECLARE_BITMAP(bm, MAX_NUMNODES);
1674
1675 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1676 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1677
1678 if (nmask) {
1679 if (compat_get_bitmap(bm, nmask, nr_bits))
1680 return -EFAULT;
1681 nm = compat_alloc_user_space(alloc_size);
1682 if (copy_to_user(nm, bm, alloc_size))
1683 return -EFAULT;
1684 }
1685
1686 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1687}
1688
1689COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1690 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1691 compat_ulong_t, maxnode, compat_ulong_t, flags)
1692{
1693 unsigned long __user *nm = NULL;
1694 unsigned long nr_bits, alloc_size;
1695 nodemask_t bm;
1696
1697 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1698 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1699
1700 if (nmask) {
1701 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1702 return -EFAULT;
1703 nm = compat_alloc_user_space(alloc_size);
1704 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1705 return -EFAULT;
1706 }
1707
1708 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1709}
1710
1711COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1712 compat_ulong_t, maxnode,
1713 const compat_ulong_t __user *, old_nodes,
1714 const compat_ulong_t __user *, new_nodes)
1715{
1716 unsigned long __user *old = NULL;
1717 unsigned long __user *new = NULL;
1718 nodemask_t tmp_mask;
1719 unsigned long nr_bits;
1720 unsigned long size;
1721
1722 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1723 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1724 if (old_nodes) {
1725 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1726 return -EFAULT;
1727 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1728 if (new_nodes)
1729 new = old + size / sizeof(unsigned long);
1730 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1731 return -EFAULT;
1732 }
1733 if (new_nodes) {
1734 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1735 return -EFAULT;
1736 if (new == NULL)
1737 new = compat_alloc_user_space(size);
1738 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1739 return -EFAULT;
1740 }
1741 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1742}
1743
1744#endif /* CONFIG_COMPAT */
1745
1746bool vma_migratable(struct vm_area_struct *vma)
1747{
1748 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1749 return false;
1750
1751 /*
1752 * DAX device mappings require predictable access latency, so avoid
1753 * incurring periodic faults.
1754 */
1755 if (vma_is_dax(vma))
1756 return false;
1757
1758 if (is_vm_hugetlb_page(vma) &&
1759 !hugepage_migration_supported(hstate_vma(vma)))
1760 return false;
1761
1762 /*
1763 * Migration allocates pages in the highest zone. If we cannot
1764 * do so then migration (at least from node to node) is not
1765 * possible.
1766 */
1767 if (vma->vm_file &&
1768 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1769 < policy_zone)
1770 return false;
1771 return true;
1772}
1773
1774struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1775 unsigned long addr)
1776{
1777 struct mempolicy *pol = NULL;
1778
1779 if (vma) {
1780 if (vma->vm_ops && vma->vm_ops->get_policy) {
1781 pol = vma->vm_ops->get_policy(vma, addr);
1782 } else if (vma->vm_policy) {
1783 pol = vma->vm_policy;
1784
1785 /*
1786 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1787 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1788 * count on these policies which will be dropped by
1789 * mpol_cond_put() later
1790 */
1791 if (mpol_needs_cond_ref(pol))
1792 mpol_get(pol);
1793 }
1794 }
1795
1796 return pol;
1797}
1798
1799/*
1800 * get_vma_policy(@vma, @addr)
1801 * @vma: virtual memory area whose policy is sought
1802 * @addr: address in @vma for shared policy lookup
1803 *
1804 * Returns effective policy for a VMA at specified address.
1805 * Falls back to current->mempolicy or system default policy, as necessary.
1806 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1807 * count--added by the get_policy() vm_op, as appropriate--to protect against
1808 * freeing by another task. It is the caller's responsibility to free the
1809 * extra reference for shared policies.
1810 */
1811static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1812 unsigned long addr)
1813{
1814 struct mempolicy *pol = __get_vma_policy(vma, addr);
1815
1816 if (!pol)
1817 pol = get_task_policy(current);
1818
1819 return pol;
1820}
1821
1822bool vma_policy_mof(struct vm_area_struct *vma)
1823{
1824 struct mempolicy *pol;
1825
1826 if (vma->vm_ops && vma->vm_ops->get_policy) {
1827 bool ret = false;
1828
1829 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1830 if (pol && (pol->flags & MPOL_F_MOF))
1831 ret = true;
1832 mpol_cond_put(pol);
1833
1834 return ret;
1835 }
1836
1837 pol = vma->vm_policy;
1838 if (!pol)
1839 pol = get_task_policy(current);
1840
1841 return pol->flags & MPOL_F_MOF;
1842}
1843
1844static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1845{
1846 enum zone_type dynamic_policy_zone = policy_zone;
1847
1848 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1849
1850 /*
1851 * if policy->v.nodes has movable memory only,
1852 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1853 *
1854 * policy->v.nodes is intersected with node_states[N_MEMORY],
1855 * so if the following test fails, it implies
1856 * policy->v.nodes has movable memory only.
1857 */
1858 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1859 dynamic_policy_zone = ZONE_MOVABLE;
1860
1861 return zone >= dynamic_policy_zone;
1862}
1863
1864/*
1865 * Return a nodemask representing a mempolicy for filtering nodes for
1866 * page allocation
1867 */
1868nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1869{
1870 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1871 if (unlikely(policy->mode == MPOL_BIND) &&
1872 apply_policy_zone(policy, gfp_zone(gfp)) &&
1873 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1874 return &policy->v.nodes;
1875
1876 return NULL;
1877}
1878
1879/* Return the node id preferred by the given mempolicy, or the given id */
1880static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1881{
1882 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1883 nd = policy->v.preferred_node;
1884 else {
1885 /*
1886 * __GFP_THISNODE shouldn't even be used with the bind policy
1887 * because we might easily break the expectation to stay on the
1888 * requested node and not break the policy.
1889 */
1890 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1891 }
1892
1893 return nd;
1894}
1895
1896/* Do dynamic interleaving for a process */
1897static unsigned interleave_nodes(struct mempolicy *policy)
1898{
1899 unsigned next;
1900 struct task_struct *me = current;
1901
1902 next = next_node_in(me->il_prev, policy->v.nodes);
1903 if (next < MAX_NUMNODES)
1904 me->il_prev = next;
1905 return next;
1906}
1907
1908/*
1909 * Depending on the memory policy provide a node from which to allocate the
1910 * next slab entry.
1911 */
1912unsigned int mempolicy_slab_node(void)
1913{
1914 struct mempolicy *policy;
1915 int node = numa_mem_id();
1916
1917 if (in_interrupt())
1918 return node;
1919
1920 policy = current->mempolicy;
1921 if (!policy || policy->flags & MPOL_F_LOCAL)
1922 return node;
1923
1924 switch (policy->mode) {
1925 case MPOL_PREFERRED:
1926 /*
1927 * handled MPOL_F_LOCAL above
1928 */
1929 return policy->v.preferred_node;
1930
1931 case MPOL_INTERLEAVE:
1932 return interleave_nodes(policy);
1933
1934 case MPOL_BIND: {
1935 struct zoneref *z;
1936
1937 /*
1938 * Follow bind policy behavior and start allocation at the
1939 * first node.
1940 */
1941 struct zonelist *zonelist;
1942 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1943 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1944 z = first_zones_zonelist(zonelist, highest_zoneidx,
1945 &policy->v.nodes);
1946 return z->zone ? zone_to_nid(z->zone) : node;
1947 }
1948
1949 default:
1950 BUG();
1951 }
1952}
1953
1954/*
1955 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1956 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1957 * number of present nodes.
1958 */
1959static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1960{
1961 unsigned nnodes = nodes_weight(pol->v.nodes);
1962 unsigned target;
1963 int i;
1964 int nid;
1965
1966 if (!nnodes)
1967 return numa_node_id();
1968 target = (unsigned int)n % nnodes;
1969 nid = first_node(pol->v.nodes);
1970 for (i = 0; i < target; i++)
1971 nid = next_node(nid, pol->v.nodes);
1972 return nid;
1973}
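/*
 * Worked example for the static interleave above, with illustrative numbers:
 * given pol->v.nodes = {1,3,5} (nnodes = 3) and n = 7, target is 7 % 3 = 1,
 * so the walk starts at node 1 and advances once, returning node 3.
 */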
1974
1975/* Determine a node number for interleave */
1976static inline unsigned interleave_nid(struct mempolicy *pol,
1977 struct vm_area_struct *vma, unsigned long addr, int shift)
1978{
1979 if (vma) {
1980 unsigned long off;
1981
1982 /*
1983 * for small pages, there is no difference between
1984 * shift and PAGE_SHIFT, so the bit-shift is safe.
1985 * for huge pages, since vm_pgoff is in units of small
1986 * pages, we need to shift off the always 0 bits to get
1987 * a useful offset.
1988 */
1989 BUG_ON(shift < PAGE_SHIFT);
1990 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1991 off += (addr - vma->vm_start) >> shift;
1992 return offset_il_node(pol, off);
1993 } else
1994 return interleave_nodes(pol);
1995}
1996
1997#ifdef CONFIG_HUGETLBFS
1998/*
1999 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
2000 * @vma: virtual memory area whose policy is sought
2001 * @addr: address in @vma for shared policy lookup and interleave policy
2002 * @gfp_flags: for requested zone
2003 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2004 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
2005 *
2006 * Returns a nid suitable for a huge page allocation and a pointer
2007 * to the struct mempolicy for conditional unref after allocation.
2008 * If the effective policy is 'bind', returns a pointer to the mempolicy's
2009 * @nodemask for filtering the zonelist.
2010 *
2011 * Must be protected by read_mems_allowed_begin()
2012 */
2013int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2014 struct mempolicy **mpol, nodemask_t **nodemask)
2015{
2016 int nid;
2017
2018 *mpol = get_vma_policy(vma, addr);
2019 *nodemask = NULL; /* assume !MPOL_BIND */
2020
2021 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
2022 nid = interleave_nid(*mpol, vma, addr,
2023 huge_page_shift(hstate_vma(vma)));
2024 } else {
2025 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2026 if ((*mpol)->mode == MPOL_BIND)
2027 *nodemask = &(*mpol)->v.nodes;
2028 }
2029 return nid;
2030}
2031
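/*
 * Sketch of a hugetlb-style caller (simplified; gfp, h and the
 * surrounding error handling are assumptions from the caller's context):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
 *	page = __alloc_pages_nodemask(gfp, huge_page_order(h), nid, nodemask);
 *	mpol_cond_put(mpol);
 */
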
2032/*
2033 * init_nodemask_of_mempolicy
2034 *
2035 * If the current task's mempolicy is "default" [NULL], return 'false'
2036 * to indicate default policy. Otherwise, extract the policy nodemask
2037 * for 'bind' or 'interleave' policy into the argument nodemask, or
2038 * initialize the argument nodemask to contain the single node for
2039 * 'preferred' or 'local' policy and return 'true' to indicate presence
2040 * of non-default mempolicy.
2041 *
2042 * We don't bother with reference counting the mempolicy [mpol_get/put]
2043 * because the current task is examining its own mempolicy, and a task's
2044 * mempolicy is only ever changed by the task itself.
2045 *
2046 * N.B., it is the caller's responsibility to free a returned nodemask.
2047 */
2048bool init_nodemask_of_mempolicy(nodemask_t *mask)
2049{
2050 struct mempolicy *mempolicy;
2051 int nid;
2052
2053 if (!(mask && current->mempolicy))
2054 return false;
2055
2056 task_lock(current);
2057 mempolicy = current->mempolicy;
2058 switch (mempolicy->mode) {
2059 case MPOL_PREFERRED:
2060 if (mempolicy->flags & MPOL_F_LOCAL)
2061 nid = numa_node_id();
2062 else
2063 nid = mempolicy->v.preferred_node;
2064 init_nodemask_of_node(mask, nid);
2065 break;
2066
2067 case MPOL_BIND:
2068 case MPOL_INTERLEAVE:
2069 *mask = mempolicy->v.nodes;
2070 break;
2071
2072 default:
2073 BUG();
2074 }
2075 task_unlock(current);
2076
2077 return true;
2078}
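
/*
 * Sketch of the intended use (modelled on the hugetlb sysfs/sysctl path;
 * the NODEMASK_* scaffolding belongs to the caller and is shown only for
 * illustration):
 *
 *	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 *	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed))
 *		...operate on *nodes_allowed only...
 *	else
 *		...fall back to node_states[N_MEMORY]...
 *	NODEMASK_FREE(nodes_allowed);
 */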
2079#endif
2080
2081/*
2082 * mempolicy_nodemask_intersects
2083 *
2084 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2085 * policy. Otherwise, check for intersection between mask and the policy
2086 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
2087 * policy, always return true since it may allocate elsewhere on fallback.
2088 *
2089 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2090 */
2091bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2092 const nodemask_t *mask)
2093{
2094 struct mempolicy *mempolicy;
2095 bool ret = true;
2096
2097 if (!mask)
2098 return ret;
2099 task_lock(tsk);
2100 mempolicy = tsk->mempolicy;
2101 if (!mempolicy)
2102 goto out;
2103
2104 switch (mempolicy->mode) {
2105 case MPOL_PREFERRED:
2106 /*
2107		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
2108		 * allocate from; the task may fall back to other nodes when OOM.
2109 * Thus, it's possible for tsk to have allocated memory from
2110 * nodes in mask.
2111 */
2112 break;
2113 case MPOL_BIND:
2114 case MPOL_INTERLEAVE:
2115 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2116 break;
2117 default:
2118 BUG();
2119 }
2120out:
2121 task_unlock(tsk);
2122 return ret;
2123}
2124
2125/* Allocate a page in interleaved policy.
2126 Own path because it needs to do special accounting. */
2127static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2128 unsigned nid)
2129{
2130 struct page *page;
2131
2132 page = __alloc_pages(gfp, order, nid);
2133 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2134 if (!static_branch_likely(&vm_numa_stat_key))
2135 return page;
2136 if (page && page_to_nid(page) == nid) {
2137 preempt_disable();
2138 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2139 preempt_enable();
2140 }
2141 return page;
2142}
2143
2144/**
2145 * alloc_pages_vma - Allocate a page for a VMA.
2146 *
2147 * @gfp:
2148 * %GFP_USER user allocation,
2149 * %GFP_KERNEL kernel allocations,
2150 * %GFP_HIGHMEM highmem/user allocations,
2151 * %GFP_FS allocation should not call back into a file system,
2152 * %GFP_ATOMIC don't sleep.
2153 *
2154 * @order: Order of the GFP allocation.
2155 * @vma: Pointer to VMA or NULL if not available.
2156 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2157 * @node: Which node to prefer for allocation (modulo policy).
2158 * @hugepage: for hugepages try only the preferred node if possible
2159 *
2160 * This function allocates a page from the kernel page pool and applies
2161 * a NUMA policy associated with the VMA or the current process.
2162 * When @vma is not NULL, the caller must hold a read lock on the mmap_lock
2163 * of the VMA's mm_struct to prevent it from going away. Should be used for
2164 * all allocations of pages that will be mapped into user space. Returns
2165 * NULL when no page can be allocated.
2166 */
2167struct page *
2168alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2169 unsigned long addr, int node, bool hugepage)
2170{
2171 struct mempolicy *pol;
2172 struct page *page;
2173 int preferred_nid;
2174 nodemask_t *nmask;
2175
2176 pol = get_vma_policy(vma, addr);
2177
2178 if (pol->mode == MPOL_INTERLEAVE) {
2179 unsigned nid;
2180
2181 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2182 mpol_cond_put(pol);
2183 page = alloc_page_interleave(gfp, order, nid);
2184 goto out;
2185 }
2186
2187 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2188 int hpage_node = node;
2189
2190 /*
2191 * For hugepage allocation and non-interleave policy which
2192 * allows the current node (or other explicitly preferred
2193 * node) we only try to allocate from the current/preferred
2194 * node and don't fall back to other nodes, as the cost of
2195 * remote accesses would likely offset THP benefits.
2196 *
2197 * If the policy is interleave, or does not allow the current
2198 * node in its nodemask, we allocate the standard way.
2199 */
2200 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2201 hpage_node = pol->v.preferred_node;
2202
2203 nmask = policy_nodemask(gfp, pol);
2204 if (!nmask || node_isset(hpage_node, *nmask)) {
2205 mpol_cond_put(pol);
2206 /*
2207 * First, try to allocate THP only on local node, but
2208 * don't reclaim unnecessarily, just compact.
2209 */
2210 page = __alloc_pages_node(hpage_node,
2211 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2212
2213 /*
2214 * If hugepage allocations are configured to always
2215			 * use synchronous compaction or the vma has been
2216			 * madvised to prefer hugepage backing, retry allowing
2217			 * remote memory with both reclaim and compaction.
2218 */
2219 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2220 page = __alloc_pages_nodemask(gfp, order,
2221 hpage_node, nmask);
2222
2223 goto out;
2224 }
2225 }
2226
2227 nmask = policy_nodemask(gfp, pol);
2228 preferred_nid = policy_node(gfp, pol, node);
2229 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2230 mpol_cond_put(pol);
2231out:
2232 return page;
2233}
2234EXPORT_SYMBOL(alloc_pages_vma);
2235
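/*
 * Illustrative call (a sketch of a fault-path style user; vmf comes from
 * the caller's context): most users go through the alloc_page_vma()
 * wrapper, which expands to roughly
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address,
 *			       numa_node_id(), false);
 *
 * i.e. order 0, prefer the local node, no THP-specific handling.
 */
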
2236/**
2237 * alloc_pages_current - Allocate pages.
2238 *
2239 * @gfp:
2240 * %GFP_USER user allocation,
2241 * %GFP_KERNEL kernel allocation,
2242 * %GFP_HIGHMEM highmem allocation,
2243 * %GFP_FS don't call back into a file system.
2244 * %GFP_ATOMIC don't sleep.
2245 * @order: Power of two of allocation size in pages. 0 is a single page.
2246 *
2247 * Allocate a page from the kernel page pool. When not in
2248 * interrupt context, apply the current process' NUMA policy.
2249 * Returns NULL when no page can be allocated.
2250 */
2251struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2252{
2253 struct mempolicy *pol = &default_policy;
2254 struct page *page;
2255
2256 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2257 pol = get_task_policy(current);
2258
2259 /*
2260 * No reference counting needed for current->mempolicy
2261 * nor system default_policy
2262 */
2263 if (pol->mode == MPOL_INTERLEAVE)
2264 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2265 else
2266 page = __alloc_pages_nodemask(gfp, order,
2267 policy_node(gfp, pol, numa_node_id()),
2268 policy_nodemask(gfp, pol));
2269
2270 return page;
2271}
2272EXPORT_SYMBOL(alloc_pages_current);
2273
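/*
 * Illustration: with CONFIG_NUMA, the generic alloc_pages() helper maps
 * onto alloc_pages_current(), so a plain kernel allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	// four pages
 *	if (page)
 *		__free_pages(page, 2);
 *
 * already honours the calling task's mempolicy, interleaving included.
 */
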
2274int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2275{
2276 struct mempolicy *pol = mpol_dup(vma_policy(src));
2277
2278 if (IS_ERR(pol))
2279 return PTR_ERR(pol);
2280 dst->vm_policy = pol;
2281 return 0;
2282}
2283
2284/*
2285 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2286 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2287 * with the mems_allowed returned by cpuset_mems_allowed(). This
2288 * keeps mempolicies cpuset-relative after its cpuset moves. See
2289 * further kernel/cpuset.c update_nodemask().
2290 *
2291 * current's mempolicy may be rebound by another task (the task that changes
2292 * the cpuset's mems), so we needn't do the rebind work for the current task.
2293 */
2294
2295/* Slow path of a mempolicy duplicate */
2296struct mempolicy *__mpol_dup(struct mempolicy *old)
2297{
2298 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2299
2300 if (!new)
2301 return ERR_PTR(-ENOMEM);
2302
2303 /* task's mempolicy is protected by alloc_lock */
2304 if (old == current->mempolicy) {
2305 task_lock(current);
2306 *new = *old;
2307 task_unlock(current);
2308 } else
2309 *new = *old;
2310
2311 if (current_cpuset_is_being_rebound()) {
2312 nodemask_t mems = cpuset_mems_allowed(current);
2313 mpol_rebind_policy(new, &mems);
2314 }
2315 atomic_set(&new->refcnt, 1);
2316 return new;
2317}
2318
2319/* Slow path of a mempolicy comparison */
2320bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2321{
2322 if (!a || !b)
2323 return false;
2324 if (a->mode != b->mode)
2325 return false;
2326 if (a->flags != b->flags)
2327 return false;
2328 if (mpol_store_user_nodemask(a))
2329 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2330 return false;
2331
2332 switch (a->mode) {
2333 case MPOL_BIND:
2334 case MPOL_INTERLEAVE:
2335 return !!nodes_equal(a->v.nodes, b->v.nodes);
2336 case MPOL_PREFERRED:
2337 /* a's ->flags is the same as b's */
2338 if (a->flags & MPOL_F_LOCAL)
2339 return true;
2340 return a->v.preferred_node == b->v.preferred_node;
2341 default:
2342 BUG();
2343 return false;
2344 }
2345}
2346
2347/*
2348 * Shared memory backing store policy support.
2349 *
2350 * Remember policies even when nobody has shared memory mapped.
2351 * The policies are kept in Red-Black tree linked from the inode.
2352 * They are protected by the sp->lock rwlock, which should be held
2353 * for any accesses to the tree.
2354 */
2355
2356/*
2357 * lookup first element intersecting start-end. Caller holds sp->lock for
2358 * reading or for writing
2359 */
2360static struct sp_node *
2361sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2362{
2363 struct rb_node *n = sp->root.rb_node;
2364
2365 while (n) {
2366 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2367
2368 if (start >= p->end)
2369 n = n->rb_right;
2370 else if (end <= p->start)
2371 n = n->rb_left;
2372 else
2373 break;
2374 }
2375 if (!n)
2376 return NULL;
2377 for (;;) {
2378 struct sp_node *w = NULL;
2379 struct rb_node *prev = rb_prev(n);
2380 if (!prev)
2381 break;
2382 w = rb_entry(prev, struct sp_node, nd);
2383 if (w->end <= start)
2384 break;
2385 n = prev;
2386 }
2387 return rb_entry(n, struct sp_node, nd);
2388}
2389
2390/*
2391 * Insert a new shared policy into the list. Caller holds sp->lock for
2392 * writing.
2393 */
2394static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2395{
2396 struct rb_node **p = &sp->root.rb_node;
2397 struct rb_node *parent = NULL;
2398 struct sp_node *nd;
2399
2400 while (*p) {
2401 parent = *p;
2402 nd = rb_entry(parent, struct sp_node, nd);
2403 if (new->start < nd->start)
2404 p = &(*p)->rb_left;
2405 else if (new->end > nd->end)
2406 p = &(*p)->rb_right;
2407 else
2408 BUG();
2409 }
2410 rb_link_node(&new->nd, parent, p);
2411 rb_insert_color(&new->nd, &sp->root);
2412 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2413 new->policy ? new->policy->mode : 0);
2414}
2415
2416/* Find shared policy intersecting idx */
2417struct mempolicy *
2418mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2419{
2420 struct mempolicy *pol = NULL;
2421 struct sp_node *sn;
2422
2423 if (!sp->root.rb_node)
2424 return NULL;
2425 read_lock(&sp->lock);
2426 sn = sp_lookup(sp, idx, idx+1);
2427 if (sn) {
2428 mpol_get(sn->policy);
2429 pol = sn->policy;
2430 }
2431 read_unlock(&sp->lock);
2432 return pol;
2433}
2434
2435static void sp_free(struct sp_node *n)
2436{
2437 mpol_put(n->policy);
2438 kmem_cache_free(sn_cache, n);
2439}
2440
2441/**
2442 * mpol_misplaced - check whether current page node is valid in policy
2443 *
2444 * @page: page to be checked
2445 * @vma: vm area where page mapped
2446 * @addr: virtual address where page mapped
2447 *
2448 * Lookup current policy node id for vma,addr and "compare to" page's
2449 * node id.
2450 *
2451 * Returns:
2452 * -1 - not misplaced, page is in the right node
2453 * node - node id where the page should be
2454 *
2455 * Policy determination "mimics" alloc_page_vma().
2456 * Called from fault path where we know the vma and faulting address.
2457 */
2458int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2459{
2460 struct mempolicy *pol;
2461 struct zoneref *z;
2462 int curnid = page_to_nid(page);
2463 unsigned long pgoff;
2464 int thiscpu = raw_smp_processor_id();
2465 int thisnid = cpu_to_node(thiscpu);
2466 int polnid = NUMA_NO_NODE;
2467 int ret = -1;
2468
2469 pol = get_vma_policy(vma, addr);
2470 if (!(pol->flags & MPOL_F_MOF))
2471 goto out;
2472
2473 switch (pol->mode) {
2474 case MPOL_INTERLEAVE:
2475 pgoff = vma->vm_pgoff;
2476 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2477 polnid = offset_il_node(pol, pgoff);
2478 break;
2479
2480 case MPOL_PREFERRED:
2481 if (pol->flags & MPOL_F_LOCAL)
2482 polnid = numa_node_id();
2483 else
2484 polnid = pol->v.preferred_node;
2485 break;
2486
2487 case MPOL_BIND:
2488
2489 /*
2490 * allows binding to multiple nodes.
2491 * use current page if in policy nodemask,
2492 * else select nearest allowed node, if any.
2493 * If no allowed nodes, use current [!misplaced].
2494 */
2495 if (node_isset(curnid, pol->v.nodes))
2496 goto out;
2497 z = first_zones_zonelist(
2498 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2499 gfp_zone(GFP_HIGHUSER),
2500 &pol->v.nodes);
2501 polnid = zone_to_nid(z->zone);
2502 break;
2503
2504 default:
2505 BUG();
2506 }
2507
2508 /* Migrate the page towards the node whose CPU is referencing it */
2509 if (pol->flags & MPOL_F_MORON) {
2510 polnid = thisnid;
2511
2512 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2513 goto out;
2514 }
2515
2516 if (curnid != polnid)
2517 ret = polnid;
2518out:
2519 mpol_cond_put(pol);
2520
2521 return ret;
2522}
2523
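/*
 * Sketch of a consumer (NUMA hinting fault path style, simplified):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		...the page is misplaced, try to migrate it to target_nid...
 *
 * A return value of -1 means the page is already on an acceptable node.
 */
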
2524/*
2525 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2526 * dropped after task->mempolicy is set to NULL so that any allocation done as
2527 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2528 * policy.
2529 */
2530void mpol_put_task_policy(struct task_struct *task)
2531{
2532 struct mempolicy *pol;
2533
2534 task_lock(task);
2535 pol = task->mempolicy;
2536 task->mempolicy = NULL;
2537 task_unlock(task);
2538 mpol_put(pol);
2539}
2540
2541static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2542{
2543	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2544 rb_erase(&n->nd, &sp->root);
2545 sp_free(n);
2546}
2547
2548static void sp_node_init(struct sp_node *node, unsigned long start,
2549 unsigned long end, struct mempolicy *pol)
2550{
2551 node->start = start;
2552 node->end = end;
2553 node->policy = pol;
2554}
2555
2556static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2557 struct mempolicy *pol)
2558{
2559 struct sp_node *n;
2560 struct mempolicy *newpol;
2561
2562 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2563 if (!n)
2564 return NULL;
2565
2566 newpol = mpol_dup(pol);
2567 if (IS_ERR(newpol)) {
2568 kmem_cache_free(sn_cache, n);
2569 return NULL;
2570 }
2571 newpol->flags |= MPOL_F_SHARED;
2572 sp_node_init(n, start, end, newpol);
2573
2574 return n;
2575}
2576
2577/* Replace a policy range. */
2578static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2579 unsigned long end, struct sp_node *new)
2580{
2581 struct sp_node *n;
2582 struct sp_node *n_new = NULL;
2583 struct mempolicy *mpol_new = NULL;
2584 int ret = 0;
2585
2586restart:
2587 write_lock(&sp->lock);
2588 n = sp_lookup(sp, start, end);
2589 /* Take care of old policies in the same range. */
2590 while (n && n->start < end) {
2591 struct rb_node *next = rb_next(&n->nd);
2592 if (n->start >= start) {
2593 if (n->end <= end)
2594 sp_delete(sp, n);
2595 else
2596 n->start = end;
2597 } else {
2598 /* Old policy spanning whole new range. */
2599 if (n->end > end) {
2600 if (!n_new)
2601 goto alloc_new;
2602
2603 *mpol_new = *n->policy;
2604 atomic_set(&mpol_new->refcnt, 1);
2605 sp_node_init(n_new, end, n->end, mpol_new);
2606 n->end = start;
2607 sp_insert(sp, n_new);
2608 n_new = NULL;
2609 mpol_new = NULL;
2610 break;
2611 } else
2612 n->end = start;
2613 }
2614 if (!next)
2615 break;
2616 n = rb_entry(next, struct sp_node, nd);
2617 }
2618 if (new)
2619 sp_insert(sp, new);
2620 write_unlock(&sp->lock);
2621 ret = 0;
2622
2623err_out:
2624 if (mpol_new)
2625 mpol_put(mpol_new);
2626 if (n_new)
2627 kmem_cache_free(sn_cache, n_new);
2628
2629 return ret;
2630
2631alloc_new:
2632 write_unlock(&sp->lock);
2633 ret = -ENOMEM;
2634 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2635 if (!n_new)
2636 goto err_out;
2637 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2638 if (!mpol_new)
2639 goto err_out;
2640 atomic_set(&mpol_new->refcnt, 1);
2641 goto restart;
2642}
2643
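/*
 * Worked example (illustrative): if the tree holds one node covering
 * pages [0, 10) with policy A, then shared_policy_replace(sp, 3, 7, B)
 * leaves three nodes behind: [0, 3) -> A, [3, 7) -> B and [7, 10) -> A.
 * The [7, 10) remainder is built from the n_new/mpol_new pair obtained
 * under the alloc_new label, because sp->lock must be dropped before
 * allocating memory.
 */
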
2644/**
2645 * mpol_shared_policy_init - initialize shared policy for inode
2646 * @sp: pointer to inode shared policy
2647 * @mpol: struct mempolicy to install
2648 *
2649 * Install non-NULL @mpol in inode's shared policy rb-tree.
2650 * On entry, the current task has a reference on a non-NULL @mpol.
2651 * This must be released on exit.
2652 * This is called during get_inode() calls, so we can use GFP_KERNEL.
2653 */
2654void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2655{
2656 int ret;
2657
2658 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2659 rwlock_init(&sp->lock);
2660
2661 if (mpol) {
2662 struct vm_area_struct pvma;
2663 struct mempolicy *new;
2664 NODEMASK_SCRATCH(scratch);
2665
2666 if (!scratch)
2667 goto put_mpol;
2668 /* contextualize the tmpfs mount point mempolicy */
2669 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2670 if (IS_ERR(new))
2671 goto free_scratch; /* no valid nodemask intersection */
2672
2673 task_lock(current);
2674 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2675 task_unlock(current);
2676 if (ret)
2677 goto put_new;
2678
2679 /* Create pseudo-vma that contains just the policy */
2680 vma_init(&pvma, NULL);
2681 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2682 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2683
2684put_new:
2685 mpol_put(new); /* drop initial ref */
2686free_scratch:
2687 NODEMASK_SCRATCH_FREE(scratch);
2688put_mpol:
2689 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2690 }
2691}
2692
2693int mpol_set_shared_policy(struct shared_policy *info,
2694 struct vm_area_struct *vma, struct mempolicy *npol)
2695{
2696 int err;
2697 struct sp_node *new = NULL;
2698 unsigned long sz = vma_pages(vma);
2699
2700 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2701 vma->vm_pgoff,
2702 sz, npol ? npol->mode : -1,
2703 npol ? npol->flags : -1,
2704 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2705
2706 if (npol) {
2707 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2708 if (!new)
2709 return -ENOMEM;
2710 }
2711 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2712 if (err && new)
2713 sp_free(new);
2714 return err;
2715}
2716
2717/* Free a backing policy store on inode delete. */
2718void mpol_free_shared_policy(struct shared_policy *p)
2719{
2720 struct sp_node *n;
2721 struct rb_node *next;
2722
2723 if (!p->root.rb_node)
2724 return;
2725 write_lock(&p->lock);
2726 next = rb_first(&p->root);
2727 while (next) {
2728 n = rb_entry(next, struct sp_node, nd);
2729 next = rb_next(&n->nd);
2730 sp_delete(p, n);
2731 }
2732 write_unlock(&p->lock);
2733}
2734
2735#ifdef CONFIG_NUMA_BALANCING
2736static int __initdata numabalancing_override;
2737
2738static void __init check_numabalancing_enable(void)
2739{
2740 bool numabalancing_default = false;
2741
2742 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2743 numabalancing_default = true;
2744
2745 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2746 if (numabalancing_override)
2747 set_numabalancing_state(numabalancing_override == 1);
2748
2749 if (num_online_nodes() > 1 && !numabalancing_override) {
2750 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2751 numabalancing_default ? "Enabling" : "Disabling");
2752 set_numabalancing_state(numabalancing_default);
2753 }
2754}
2755
2756static int __init setup_numabalancing(char *str)
2757{
2758 int ret = 0;
2759 if (!str)
2760 goto out;
2761
2762 if (!strcmp(str, "enable")) {
2763 numabalancing_override = 1;
2764 ret = 1;
2765 } else if (!strcmp(str, "disable")) {
2766 numabalancing_override = -1;
2767 ret = 1;
2768 }
2769out:
2770 if (!ret)
2771 pr_warn("Unable to parse numa_balancing=\n");
2772
2773 return ret;
2774}
2775__setup("numa_balancing=", setup_numabalancing);
2776#else
2777static inline void __init check_numabalancing_enable(void)
2778{
2779}
2780#endif /* CONFIG_NUMA_BALANCING */
2781
2782/* assumes fs == KERNEL_DS */
2783void __init numa_policy_init(void)
2784{
2785 nodemask_t interleave_nodes;
2786 unsigned long largest = 0;
2787 int nid, prefer = 0;
2788
2789 policy_cache = kmem_cache_create("numa_policy",
2790 sizeof(struct mempolicy),
2791 0, SLAB_PANIC, NULL);
2792
2793 sn_cache = kmem_cache_create("shared_policy_node",
2794 sizeof(struct sp_node),
2795 0, SLAB_PANIC, NULL);
2796
2797 for_each_node(nid) {
2798 preferred_node_policy[nid] = (struct mempolicy) {
2799 .refcnt = ATOMIC_INIT(1),
2800 .mode = MPOL_PREFERRED,
2801 .flags = MPOL_F_MOF | MPOL_F_MORON,
2802 .v = { .preferred_node = nid, },
2803 };
2804 }
2805
2806 /*
2807 * Set interleaving policy for system init. Interleaving is only
2808	 * enabled across suitably sized nodes (default is >= 16MB); otherwise
2809	 * fall back to the largest node if they're all smaller.
2810 */
2811 nodes_clear(interleave_nodes);
2812 for_each_node_state(nid, N_MEMORY) {
2813 unsigned long total_pages = node_present_pages(nid);
2814
2815 /* Preserve the largest node */
2816 if (largest < total_pages) {
2817 largest = total_pages;
2818 prefer = nid;
2819 }
2820
2821 /* Interleave this node? */
2822 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2823 node_set(nid, interleave_nodes);
2824 }
2825
2826 /* All too small, use the largest */
2827 if (unlikely(nodes_empty(interleave_nodes)))
2828 node_set(prefer, interleave_nodes);
2829
2830 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2831 pr_err("%s: interleaving failed\n", __func__);
2832
2833 check_numabalancing_enable();
2834}
2835
2836/* Reset policy of current process to default */
2837void numa_default_policy(void)
2838{
2839 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2840}
2841
2842/*
2843 * Parse and format mempolicy from/to strings
2844 */
2845
2846/*
2847 * "local" is implemented internally by MPOL_PREFERRED with the MPOL_F_LOCAL flag.
2848 */
2849static const char * const policy_modes[] =
2850{
2851 [MPOL_DEFAULT] = "default",
2852 [MPOL_PREFERRED] = "prefer",
2853 [MPOL_BIND] = "bind",
2854 [MPOL_INTERLEAVE] = "interleave",
2855 [MPOL_LOCAL] = "local",
2856};
2857
2858
2859#ifdef CONFIG_TMPFS
2860/**
2861 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2862 * @str: string containing mempolicy to parse
2863 * @mpol: pointer to struct mempolicy pointer, returned on success.
2864 *
2865 * Format of input:
2866 * <mode>[=<flags>][:<nodelist>]
2867 *
2868 * On success, returns 0, else 1
2869 */
2870int mpol_parse_str(char *str, struct mempolicy **mpol)
2871{
2872 struct mempolicy *new = NULL;
2873 unsigned short mode_flags;
2874 nodemask_t nodes;
2875 char *nodelist = strchr(str, ':');
2876 char *flags = strchr(str, '=');
2877 int err = 1, mode;
2878
2879 if (flags)
2880 *flags++ = '\0'; /* terminate mode string */
2881
2882 if (nodelist) {
2883 /* NUL-terminate mode or flags string */
2884 *nodelist++ = '\0';
2885 if (nodelist_parse(nodelist, nodes))
2886 goto out;
2887 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2888 goto out;
2889 } else
2890 nodes_clear(nodes);
2891
2892 mode = match_string(policy_modes, MPOL_MAX, str);
2893 if (mode < 0)
2894 goto out;
2895
2896 switch (mode) {
2897 case MPOL_PREFERRED:
2898 /*
2899		 * Insist on a nodelist of one node only; later we use
2900		 * first_node(nodes) to grab a single node, so the
2901		 * nodelist (or nodes) cannot be empty here.
2902 */
2903 if (nodelist) {
2904 char *rest = nodelist;
2905 while (isdigit(*rest))
2906 rest++;
2907 if (*rest)
2908 goto out;
2909 if (nodes_empty(nodes))
2910 goto out;
2911 }
2912 break;
2913 case MPOL_INTERLEAVE:
2914 /*
2915 * Default to online nodes with memory if no nodelist
2916 */
2917 if (!nodelist)
2918 nodes = node_states[N_MEMORY];
2919 break;
2920 case MPOL_LOCAL:
2921 /*
2922 * Don't allow a nodelist; mpol_new() checks flags
2923 */
2924 if (nodelist)
2925 goto out;
2926 mode = MPOL_PREFERRED;
2927 break;
2928 case MPOL_DEFAULT:
2929 /*
2930		 * Insist on an empty nodelist
2931 */
2932 if (!nodelist)
2933 err = 0;
2934 goto out;
2935 case MPOL_BIND:
2936 /*
2937 * Insist on a nodelist
2938 */
2939 if (!nodelist)
2940 goto out;
2941 }
2942
2943 mode_flags = 0;
2944 if (flags) {
2945 /*
2946 * Currently, we only support two mutually exclusive
2947 * mode flags.
2948 */
2949 if (!strcmp(flags, "static"))
2950 mode_flags |= MPOL_F_STATIC_NODES;
2951 else if (!strcmp(flags, "relative"))
2952 mode_flags |= MPOL_F_RELATIVE_NODES;
2953 else
2954 goto out;
2955 }
2956
2957 new = mpol_new(mode, mode_flags, &nodes);
2958 if (IS_ERR(new))
2959 goto out;
2960
2961 /*
2962 * Save nodes for mpol_to_str() to show the tmpfs mount options
2963 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2964 */
2965 if (mode != MPOL_PREFERRED)
2966 new->v.nodes = nodes;
2967 else if (nodelist)
2968 new->v.preferred_node = first_node(nodes);
2969 else
2970 new->flags |= MPOL_F_LOCAL;
2971
2972 /*
2973 * Save nodes for contextualization: this will be used to "clone"
2974 * the mempolicy in a specific context [cpuset] at a later time.
2975 */
2976 new->w.user_nodemask = nodes;
2977
2978 err = 0;
2979
2980out:
2981 /* Restore string for error message */
2982 if (nodelist)
2983 *--nodelist = ':';
2984 if (flags)
2985 *--flags = '=';
2986 if (!err)
2987 *mpol = new;
2988 return err;
2989}
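
/*
 * Examples of accepted input (informational only):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"prefer=static:1"	MPOL_PREFERRED + MPOL_F_STATIC_NODES, node 1
 *	"bind=relative:0,2"	MPOL_BIND + MPOL_F_RELATIVE_NODES, nodes 0 and 2
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *	"default"		no explicit policy; *mpol is set to NULL
 */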
2990#endif /* CONFIG_TMPFS */
2991
2992/**
2993 * mpol_to_str - format a mempolicy structure for printing
2994 * @buffer: to contain formatted mempolicy string
2995 * @maxlen: length of @buffer
2996 * @pol: pointer to mempolicy to be formatted
2997 *
2998 * Convert @pol into a string. If @buffer is too short, truncate the string.
2999 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3000 * longest flag, "relative", and to display at least a few node ids.
3001 */
3002void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3003{
3004 char *p = buffer;
3005 nodemask_t nodes = NODE_MASK_NONE;
3006 unsigned short mode = MPOL_DEFAULT;
3007 unsigned short flags = 0;
3008
3009 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3010 mode = pol->mode;
3011 flags = pol->flags;
3012 }
3013
3014 switch (mode) {
3015 case MPOL_DEFAULT:
3016 break;
3017 case MPOL_PREFERRED:
3018 if (flags & MPOL_F_LOCAL)
3019 mode = MPOL_LOCAL;
3020 else
3021 node_set(pol->v.preferred_node, nodes);
3022 break;
3023 case MPOL_BIND:
3024 case MPOL_INTERLEAVE:
3025 nodes = pol->v.nodes;
3026 break;
3027 default:
3028 WARN_ON_ONCE(1);
3029 snprintf(p, maxlen, "unknown");
3030 return;
3031 }
3032
3033 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3034
3035 if (flags & MPOL_MODE_FLAGS) {
3036 p += snprintf(p, buffer + maxlen - p, "=");
3037
3038 /*
3039 * Currently, the only defined flags are mutually exclusive
3040 */
3041 if (flags & MPOL_F_STATIC_NODES)
3042 p += snprintf(p, buffer + maxlen - p, "static");
3043 else if (flags & MPOL_F_RELATIVE_NODES)
3044 p += snprintf(p, buffer + maxlen - p, "relative");
3045 }
3046
3047 if (!nodes_empty(nodes))
3048 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3049 nodemask_pr_args(&nodes));
3050}
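
/*
 * Example outputs (informational): given a sufficiently large buffer,
 * mpol_to_str() emits strings such as "default", "local", "prefer:1",
 * "bind=static:0-3" or "interleave:0,2", i.e. the same syntax that
 * mpol_parse_str() above accepts.
 */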