/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy an interleave
 *		counter is used.
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 * preferred	Try a specific node first before normal fallback.
 *		As a special case node -1 here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
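/*
 * Illustrative only, not kernel code: a minimal userspace sketch (assuming
 * the raw syscall wrappers set_mempolicy() and mbind() that correspond to
 * the syscalls defined in this file) of how these policies are typically
 * requested:
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1);
 *	mbind(addr, len, MPOL_BIND, &nodes, 8 * sizeof(nodes) + 1,
 *	      MPOL_MF_STRICT);
 *
 * The first call interleaves all further allocations of the calling process
 * over nodes 0 and 1; the second restricts one existing mapping to those
 * nodes and, because of MPOL_MF_STRICT, fails if pages of the range already
 * live elsewhere.
 */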
/*
 * TODO:
 * - fix mmap readahead to honour policy and enable policy for any page
 *   cache object
 * - statistics for bigpages
 * - global policy for page cache? currently it uses process policy;
 *   requires the readahead fix above
 * - handle mremap for shared memory (currently ignored for the policy)
 * - make bind policy root only? It can trigger OOM much faster and the
 *   kernel is not always graceful with that.
 * - could replace all the switch()es with a mempolicy_ops structure.
 */
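/*
 * Rough sketch of the mempolicy_ops idea from the last TODO item; purely
 * hypothetical, the structure and its members are invented here for
 * illustration only:
 *
 *	struct mempolicy_ops {
 *		int (*check)(unsigned long *nodes);
 *		struct zonelist *(*zonelist)(unsigned gfp, struct mempolicy *p);
 *		unsigned (*interleave_node)(struct mempolicy *p,
 *					    struct vm_area_struct *vma,
 *					    unsigned long off);
 *	};
 *	static struct mempolicy_ops *mpol_ops[MPOL_MAX + 1];
 *
 * Each switch (policy->policy) below would then become an indirect call
 * through mpol_ops[policy->policy].
 */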
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
	DECLARE_BITMAP(online2, MAX_NUMNODES);

	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
	if (bitmap_empty(online2, MAX_NUMNODES))
		set_bit(0, online2);
	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
		return -EINVAL;
	return 0;
}
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
{
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	switch (mode) {
	case MPOL_DEFAULT:
		if (!empty)
			return -EINVAL;
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
		break;
	}
	return nodes_online(nodes);
}
/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	bitmap_zero(nodes, MAX_NUMNODES);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;

			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes[nlongs-1] &= endmask;
	return mpol_check_policy(mode, nodes);
}
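/*
 * Worked example for get_nodes() (illustrative, 64-bit kernel assumed):
 * a caller passing maxnode = 65 describes 64 usable mask bits after the
 * decrement, so nlongs = BITS_TO_LONGS(64) = 1 and endmask = ~0UL; with
 * maxnode = 33 only 32 bits are usable, nlongs is still 1 but
 * endmask = (1UL << 32) - 1, so the upper half of the first user long is
 * cleared before the policy check.
 */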
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
	struct zonelist *zl;
	int num = 0, max, nd, k;

	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd < MAX_NUMNODES;
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];

			if (!z->present_pages)
				continue;
			zl->zones[num++] = z;
		}
	}
	zl->zones[num] = NULL;
	return zl;
}
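/*
 * Example (illustrative): for nodes = {0,2} on a machine where each node
 * has a NORMAL and a DMA zone present, the generated zonelist is
 *
 *	node 0/NORMAL, node 0/DMA, node 2/NORMAL, node 2/DMA, NULL
 *
 * i.e. all zones of the first bound node are tried, highest zone first,
 * before falling back to the next bound node.
 */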
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Ensure all existing pages follow the policy. */
static int
verify_pages(struct mm_struct *mm,
	     unsigned long addr, unsigned long end, unsigned long *nodes)
{
	while (addr < end) {
		struct page *p;
		pte_t *pte;
		pmd_t *pmd;
		pud_t *pud;
		pgd_t *pgd;

		pgd = pgd_offset(mm, addr);
		if (pgd_none(*pgd)) {
			unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;

			if (next <= addr)
				break;
			addr = next;
			continue;
		}
		pud = pud_offset(pgd, addr);
		if (pud_none(*pud)) {
			addr = (addr + PUD_SIZE) & PUD_MASK;
			continue;
		}
		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			addr = (addr + PMD_SIZE) & PMD_MASK;
			continue;
		}
		p = NULL;
		pte = pte_offset_map(pmd, addr);
		if (pte_present(*pte))
			p = pte_page(*pte);
		pte_unmap(pte);
		if (p) {
			unsigned nid = page_to_nid(p);

			if (!test_bit(nid, nodes))
				return -EIO;
		}
		addr += PAGE_SIZE;
	}
	return 0;
}
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			err = verify_pages(vma->vm_mm,
					   vma->vm_start, vma->vm_end, nodes);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		vma->vm_policy = new;
		mpol_free(old);
	}
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
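/*
 * Example (illustrative): mbind()ing the middle third of one large VMA
 * first splits it at 'start', then at 'end', and applies the new policy
 * only to the middle VMA via policy_vma(); the two outer VMAs keep their
 * old policy.
 */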
/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long mode,
			  unsigned long __user *nmask, unsigned long maxnode,
			  unsigned flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);
	int err;

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
		return -EINVAL;
	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;

	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
		 mode, nodes[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
	err = PTR_ERR(vma);
	if (!IS_ERR(vma))
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
{
	int err;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if (mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(nodes, nmask, maxnode, mode);
	if (err)
		return err;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
	int i;

	bitmap_zero(nodes, MAX_NUMNODES);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				  nodes);
		break;
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, nodes_addr(node_online_map),
				    MAX_NUMNODES);
		else
			__set_bit(p->v.preferred_node, nodes);
		break;
	default:
		break;
	}
}
/* Return the node id of the page mapped at addr, or a negative error. */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}
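/*
 * Worked example (illustrative): a user asking for maxnode = 129 gets
 * copy = ALIGN(128, 64) / 8 = 16 bytes; if the kernel bitmap (nbytes) is
 * only 8 bytes, the trailing 8 user bytes are cleared with clear_user()
 * and only the real 8-byte mask is copied out.
 */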
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
{
	int err, pval;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			pval = err;
		} else if (pol == current->mempolicy &&
			   pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		pval = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	if (policy && put_user(pval, policy))
		return -EFAULT;

	err = 0;
	if (nmask) {
		DECLARE_BITMAP(nodes, MAX_NUMNODES);

		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
	}

out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
					 compat_ulong_t __user *nmask,
					 compat_ulong_t maxnode,
					 compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}
asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
					 compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
				 compat_ulong_t mode, compat_ulong_t __user *nmask,
				 compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif /* CONFIG_COMPAT */
/* Return the effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = current->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		if (gfp >= policy_zone)
			return policy->v.zonelist;
		/* FALL THROUGH */
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
	default:
		nd = numa_node_id();
		break;
	}
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
	me->il_next = next;
	return nid;
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
		c++;
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
	return nid;
}
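/*
 * Worked example (illustrative): with pol->v.nodes = {1,3,5} and off = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so the loop stops at the second set
 * bit and the page at that offset always lands on node 3.
 */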
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned gfp, unsigned order,
					  unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	BUG_ON(!node_online(nid));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
		put_cpu();
	}
	return page;
}
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocation.
 *	%GFP_HIGHMEM highmem/user allocation.
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * the NUMA policy associated with the VMA or, failing that, the current
 * process. When VMA is not NULL the caller must hold down_read on the
 * mmap_sem of the VMA's mm_struct to prevent it from going away. Should
 * be used for all allocations of pages that will be mapped into user
 * space. Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		if (vma) {
			unsigned long off;

			BUG_ON(addr >= vma->vm_end);
			BUG_ON(addr < vma->vm_start);
			off = vma->vm_pgoff;
			off += (addr - vma->vm_start) >> PAGE_SHIFT;
			nid = offset_il_node(pol, vma, off);
		} else {
			/* fall back to process interleaving */
			nid = interleave_nodes(pol);
		}
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
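/*
 * Typical call site (illustrative sketch only, not code from this file):
 * a fault handler that already holds down_read(&mm->mmap_sem) would do
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * so the newly faulted page is placed according to the VMA policy, or the
 * process policy when the VMA has none.
 */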
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocation.
 *	%GFP_HIGHMEM highmem allocation.
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate pages from the kernel page pool. When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(unsigned gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
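/*
 * On NUMA kernels the generic alloc_pages() entry point is expected to be
 * wired up to alloc_pages_current() (the actual mapping lives in
 * <linux/gfp.h>, not here), so for example
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);
 *
 * allocates an order-2 block that follows the calling process' policy
 * whenever the caller is not in interrupt context.
 */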
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);

		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;

		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		return 0;
	}
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */
/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_DEFAULT:
		return numa_node_id();
	case MPOL_BIND:
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
	case MPOL_PREFERRED:
		return pol->v.preferred_node >= 0 ?
			pol->v.preferred_node : numa_node_id();
	}
	BUG();
	return 0;
}
/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_PREFERRED:
	case MPOL_DEFAULT:
	case MPOL_INTERLEAVE:
		return 1;
	case MPOL_BIND: {
		struct zone **z;

		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
				return 1;
		return 0;
	}
	default:
		return 0;
	}
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
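/*
 * The tree stores non-overlapping [start, end) ranges in units of file
 * page offsets. Example (illustrative): an sp_node covering pgoff [0, 16)
 * answers mpol_shared_policy_lookup() for any index 0..15, no matter
 * which mapping of the object faults the page in.
 */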
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);

		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);

		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}
int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
			      MAX_NUMNODES) < 0)
		printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}