2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the node of the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
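 *
 * A rough user-space sketch (illustrative only; the node mask is a made-up
 * example assuming nodes 0 and 1 are online, and the calls are normally
 * issued through libnuma rather than raw syscalls):
 *
 *	unsigned long mask = 0x3;			(nodes 0 and 1)
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask)*8);
 *	mbind(start, length, MPOL_BIND, &mask, sizeof(mask)*8, 0);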
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always graceful about that.
59 could replace all the switch()es with a mempolicy_ops structure.
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/gfp.h>
71 #include <linux/slab.h>
72 #include <linux/string.h>
73 #include <linux/module.h>
74 #include <linux/interrupt.h>
75 #include <linux/init.h>
76 #include <linux/compat.h>
77 #include <linux/mempolicy.h>
78 #include <asm/tlbflush.h>
79 #include <asm/uaccess.h>
81 static kmem_cache_t *policy_cache;
82 static kmem_cache_t *sn_cache;
84 #define PDprintk(fmt...)
86 /* Highest zone. A specific allocation for a zone below that is not
88 static int policy_zone;
90 static struct mempolicy default_policy = {
91 .refcnt = ATOMIC_INIT(1), /* never free it */
92 .policy = MPOL_DEFAULT,
95 /* Check if all specified nodes are online */
96 static int nodes_online(unsigned long *nodes)
98 DECLARE_BITMAP(online2, MAX_NUMNODES);
100 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
101 if (bitmap_empty(online2, MAX_NUMNODES))
103 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
108 /* Do sanity checking on a policy */
109 static int mpol_check_policy(int mode, unsigned long *nodes)
111 int empty = bitmap_empty(nodes, MAX_NUMNODES);
119 case MPOL_INTERLEAVE:
120 /* Preferred will only use the first bit, but allow
126 return nodes_online(nodes);
129 /* Copy a node mask from user space. */
130 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
131 unsigned long maxnode, int mode)
134 unsigned long nlongs;
135 unsigned long endmask;
138 bitmap_zero(nodes, MAX_NUMNODES);
139 if (maxnode == 0 || !nmask)
142 nlongs = BITS_TO_LONGS(maxnode);
143 if ((maxnode % BITS_PER_LONG) == 0)
146 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
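/* Worked example (illustrative values): maxnode == 10 on a 64-bit kernel
   gives nlongs == 1 and endmask == 0x3ff, so only the low ten bits of the
   last copied word are honoured. */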
148 /* When the user specifies more nodes than supported just check
149 if the non supported part is all zero. */
150 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
151 if (nlongs > PAGE_SIZE/sizeof(long))
153 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 if (get_user(t, nmask + k))
157 if (k == nlongs - 1) {
163 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
167 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 nodes[nlongs-1] &= endmask;
170 return mpol_check_policy(mode, nodes);
173 /* Generate a custom zonelist for the BIND policy. */
174 static struct zonelist *bind_zonelist(unsigned long *nodes)
179 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
180 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
184 for (nd = find_first_bit(nodes, MAX_NUMNODES);
186 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
188 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
189 struct zone *z = &NODE_DATA(nd)->node_zones[k];
190 if (!z->present_pages)
192 zl->zones[num++] = z;
198 zl->zones[num] = NULL;
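/* Illustrative result: for nodes = {1,3} the list holds node 1's present
   zones from highest to lowest, then node 3's, terminated by NULL, so the
   allocator only ever falls back within the bound nodes. */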
202 /* Create a new policy */
203 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
205 struct mempolicy *policy;
207 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
208 if (mode == MPOL_DEFAULT)
210 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
212 return ERR_PTR(-ENOMEM);
213 atomic_set(&policy->refcnt, 1);
215 case MPOL_INTERLEAVE:
216 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
219 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
220 if (policy->v.preferred_node >= MAX_NUMNODES)
221 policy->v.preferred_node = -1;
224 policy->v.zonelist = bind_zonelist(nodes);
225 if (policy->v.zonelist == NULL) {
226 kmem_cache_free(policy_cache, policy);
227 return ERR_PTR(-ENOMEM);
231 policy->policy = mode;
235 /* Ensure all existing pages follow the policy. */
237 verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
243 pgd_t *pgd = pgd_offset_k(addr);
244 if (pgd_none(*pgd)) {
245 addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
248 pmd = pmd_offset(pgd, addr);
249 if (pmd_none(*pmd)) {
250 addr = (addr + PMD_SIZE) & PMD_MASK;
254 pte = pte_offset_map(pmd, addr);
255 if (pte_present(*pte))
259 unsigned nid = page_to_nid(p);
260 if (!test_bit(nid, nodes))
268 /* Step 1: check the range */
269 static struct vm_area_struct *
270 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
271 unsigned long *nodes, unsigned long flags)
274 struct vm_area_struct *first, *vma, *prev;
276 first = find_vma(mm, start);
278 return ERR_PTR(-EFAULT);
280 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
281 if (!vma->vm_next && vma->vm_end < end)
282 return ERR_PTR(-EFAULT);
283 if (prev && prev->vm_end < vma->vm_start)
284 return ERR_PTR(-EFAULT);
285 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
286 err = verify_pages(vma->vm_start, vma->vm_end, nodes);
288 first = ERR_PTR(err);
297 /* Apply policy to a single VMA */
298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
301 struct mempolicy *old = vma->vm_policy;
303 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
304 vma->vm_start, vma->vm_end, vma->vm_pgoff,
305 vma->vm_ops, vma->vm_file,
306 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
308 if (vma->vm_ops && vma->vm_ops->set_policy)
309 err = vma->vm_ops->set_policy(vma, new);
312 vma->vm_policy = new;
318 /* Step 2: apply policy to a range and do splits. */
319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
320 unsigned long end, struct mempolicy *new)
322 struct vm_area_struct *next;
326 for (; vma && vma->vm_start < end; vma = next) {
328 if (vma->vm_start < start)
329 err = split_vma(vma->vm_mm, vma, start, 1);
330 if (!err && vma->vm_end > end)
331 err = split_vma(vma->vm_mm, vma, end, 0);
333 err = policy_vma(vma, new);
340 /* Change policy for a memory range */
341 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
343 unsigned long __user *nmask, unsigned long maxnode,
346 struct vm_area_struct *vma;
347 struct mm_struct *mm = current->mm;
348 struct mempolicy *new;
350 DECLARE_BITMAP(nodes, MAX_NUMNODES);
353 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
355 if (start & ~PAGE_MASK)
357 if (mode == MPOL_DEFAULT)
358 flags &= ~MPOL_MF_STRICT;
359 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
366 err = get_nodes(nodes, nmask, maxnode, mode);
370 new = mpol_new(mode, nodes);
374 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
377 down_write(&mm->mmap_sem);
378 vma = check_range(mm, start, end, nodes, flags);
381 err = mbind_range(vma, start, end, new);
382 up_write(&mm->mmap_sem);
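/* Illustrative call from user space (made-up values):
     mbind(start, len, MPOL_BIND, &mask, maxnode, MPOL_MF_STRICT)
   fails with -EIO here if an existing page in the range already sits on a
   node outside the mask, because check_range() verifies existing pages
   (hugetlb VMAs excepted). */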
387 /* Set the process memory policy */
388 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
389 unsigned long maxnode)
392 struct mempolicy *new;
393 DECLARE_BITMAP(nodes, MAX_NUMNODES);
397 err = get_nodes(nodes, nmask, maxnode, mode);
400 new = mpol_new(mode, nodes);
403 mpol_free(current->mempolicy);
404 current->mempolicy = new;
405 if (new && new->policy == MPOL_INTERLEAVE)
406 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
410 /* Fill a zone bitmap for a policy */
411 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
415 bitmap_zero(nodes, MAX_NUMNODES);
418 for (i = 0; p->v.zonelist->zones[i]; i++)
419 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
423 case MPOL_INTERLEAVE:
424 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
427 /* or use current node instead of online map? */
428 if (p->v.preferred_node < 0)
429 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
431 __set_bit(p->v.preferred_node, nodes);
438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
443 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
445 err = page_to_nid(p);
451 /* Copy a kernel node mask to user space */
452 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
453 void *nodes, unsigned nbytes)
455 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
458 if (copy > PAGE_SIZE)
460 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
464 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
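/* Worked example (illustrative): maxnode == 1024 gives copy == 128 bytes;
   everything past the kernel's own nbytes of node mask was cleared above,
   so user space never sees stale data. */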
467 /* Retrieve NUMA policy */
468 asmlinkage long sys_get_mempolicy(int __user *policy,
469 unsigned long __user *nmask,
470 unsigned long maxnode,
471 unsigned long addr, unsigned long flags)
474 struct mm_struct *mm = current->mm;
475 struct vm_area_struct *vma = NULL;
476 struct mempolicy *pol = current->mempolicy;
478 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
480 if (nmask != NULL && maxnode < numnodes)
482 if (flags & MPOL_F_ADDR) {
483 down_read(&mm->mmap_sem);
484 vma = find_vma_intersection(mm, addr, addr+1);
486 up_read(&mm->mmap_sem);
489 if (vma->vm_ops && vma->vm_ops->get_policy)
490 pol = vma->vm_ops->get_policy(vma, addr);
492 pol = vma->vm_policy;
497 pol = &default_policy;
499 if (flags & MPOL_F_NODE) {
500 if (flags & MPOL_F_ADDR) {
501 err = lookup_node(mm, addr);
505 } else if (pol == current->mempolicy &&
506 pol->policy == MPOL_INTERLEAVE) {
507 pval = current->il_next;
516 if (policy && put_user(pval, policy))
521 DECLARE_BITMAP(nodes, MAX_NUMNODES);
522 get_zonemask(pol, nodes);
523 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
528 up_read(&current->mm->mmap_sem);
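/* Illustrative user-space queries (made-up address):
     get_mempolicy(&mode, mask, maxnode, 0, 0);
     get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE|MPOL_F_ADDR);
   the first returns the calling process' policy; the second returns in
   'node' the node currently backing 'addr'. */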
534 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
535 compat_ulong_t __user *nmask,
536 compat_ulong_t maxnode,
537 compat_ulong_t addr, compat_ulong_t flags)
540 unsigned long __user *nm = NULL;
541 unsigned long nr_bits, alloc_size;
542 DECLARE_BITMAP(bm, MAX_NUMNODES);
544 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
545 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
548 nm = compat_alloc_user_space(alloc_size);
550 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
553 err = copy_from_user(bm, nm, alloc_size);
554 /* ensure entire bitmap is zeroed */
555 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
556 err |= compat_put_bitmap(nmask, bm, nr_bits);
562 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
563 compat_ulong_t maxnode)
566 unsigned long __user *nm = NULL;
567 unsigned long nr_bits, alloc_size;
568 DECLARE_BITMAP(bm, MAX_NUMNODES);
570 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
571 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
574 err = compat_get_bitmap(bm, nmask, nr_bits);
575 nm = compat_alloc_user_space(alloc_size);
576 err |= copy_to_user(nm, bm, alloc_size);
582 return sys_set_mempolicy(mode, nm, nr_bits+1);
585 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
586 compat_ulong_t mode, compat_ulong_t __user *nmask,
587 compat_ulong_t maxnode, compat_ulong_t flags)
590 unsigned long __user *nm = NULL;
591 unsigned long nr_bits, alloc_size;
592 DECLARE_BITMAP(bm, MAX_NUMNODES);
594 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
595 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
598 err = compat_get_bitmap(bm, nmask, nr_bits);
599 nm = compat_alloc_user_space(alloc_size);
600 err |= copy_to_user(nm, bm, alloc_size);
606 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
611 /* Return effective policy for a VMA */
612 static struct mempolicy *
613 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
615 struct mempolicy *pol = current->mempolicy;
618 if (vma->vm_ops && vma->vm_ops->get_policy)
619 pol = vma->vm_ops->get_policy(vma, addr);
620 else if (vma->vm_policy &&
621 vma->vm_policy->policy != MPOL_DEFAULT)
622 pol = vma->vm_policy;
625 pol = &default_policy;
629 /* Return a zonelist representing a mempolicy */
630 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
634 switch (policy->policy) {
636 nd = policy->v.preferred_node;
641 /* Lower zones don't get a policy applied */
642 if (gfp >= policy_zone)
643 return policy->v.zonelist;
645 case MPOL_INTERLEAVE: /* should not happen */
653 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
656 /* Do dynamic interleaving for a process */
657 static unsigned interleave_nodes(struct mempolicy *policy)
660 struct task_struct *me = current;
663 BUG_ON(nid >= MAX_NUMNODES);
664 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
665 if (next >= MAX_NUMNODES)
666 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
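/* Worked example (illustrative): nodes {0,2,5} with il_next == 2 returns 2
   and advances il_next to 5; after node 5 the cursor wraps back to 0. */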
671 /* Do static interleaving for a VMA with known offset. */
672 static unsigned offset_il_node(struct mempolicy *pol,
673 struct vm_area_struct *vma, unsigned long off)
675 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
676 unsigned target = (unsigned)off % nnodes;
682 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
684 } while (c <= target);
685 BUG_ON(nid >= MAX_NUMNODES);
686 BUG_ON(!test_bit(nid, pol->v.nodes));
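/* Worked example (illustrative): nodes {1,4,6} and off == 10 give
   target == 10 % 3 == 1, i.e. the second set bit, so node 4 is chosen. */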
690 /* Allocate a page in interleaved policy.
691 Own path because it needs to do special accounting. */
692 static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
697 BUG_ON(!node_online(nid));
698 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
699 page = __alloc_pages(gfp, order, zl);
700 if (page && page_zone(page) == zl->zones[0]) {
701 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
708 * alloc_page_vma - Allocate a page for a VMA.
711 * %GFP_USER user allocation.
712 * %GFP_KERNEL kernel allocations,
713 * %GFP_HIGHMEM highmem/user allocations,
714 * %GFP_FS allocation should not call back into a file system.
715 * %GFP_ATOMIC don't sleep.
717 * @vma: Pointer to VMA or NULL if not available.
718 * @addr: Virtual Address of the allocation. Must be inside the VMA.
720 * This function allocates a page from the kernel page pool and applies
721 * a NUMA policy associated with the VMA or the current process.
722 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
723 * mm_struct of the VMA to prevent it from going away. Should be used for
724 * all allocations for pages that will be mapped into
725 * user space. Returns NULL when no page can be allocated.
727 * Should be called with the mmap_sem of the vma held.
730 alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
732 struct mempolicy *pol = get_vma_policy(vma, addr);
734 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
738 BUG_ON(addr >= vma->vm_end);
739 BUG_ON(addr < vma->vm_start);
741 off += (addr - vma->vm_start) >> PAGE_SHIFT;
742 nid = offset_il_node(pol, vma, off);
744 /* fall back to process interleaving */
745 nid = interleave_nodes(pol);
747 return alloc_page_interleave(gfp, 0, nid);
749 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
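/* Typical (illustrative) call site in a fault handler:
     page = alloc_page_vma(GFP_HIGHUSER, vma, address);
   with the mmap_sem held for read, as required above. */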
753 * alloc_pages_current - Allocate pages.
756 * %GFP_USER user allocation,
757 * %GFP_KERNEL kernel allocation,
758 * %GFP_HIGHMEM highmem allocation,
759 * %GFP_FS don't call back into a file system.
760 * %GFP_ATOMIC don't sleep.
761 * @order: Power of two of allocation size in pages. 0 is a single page.
763 * Allocate a page from the kernel page pool. When not in
764 * interrupt context, the current process' NUMA policy is applied.
765 * Returns NULL when no page can be allocated.
767 struct page *alloc_pages_current(unsigned gfp, unsigned order)
769 struct mempolicy *pol = current->mempolicy;
771 if (!pol || in_interrupt())
772 pol = &default_policy;
773 if (pol->policy == MPOL_INTERLEAVE)
774 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
775 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
777 EXPORT_SYMBOL(alloc_pages_current);
779 /* Slow path of a mempolicy copy */
780 struct mempolicy *__mpol_copy(struct mempolicy *old)
782 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
785 return ERR_PTR(-ENOMEM);
787 atomic_set(&new->refcnt, 1);
788 if (new->policy == MPOL_BIND) {
789 int sz = ksize(old->v.zonelist);
790 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
791 if (!new->v.zonelist) {
792 kmem_cache_free(policy_cache, new);
793 return ERR_PTR(-ENOMEM);
795 memcpy(new->v.zonelist, old->v.zonelist, sz);
800 /* Slow path of a mempolicy comparison */
801 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
805 if (a->policy != b->policy)
810 case MPOL_INTERLEAVE:
811 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
813 return a->v.preferred_node == b->v.preferred_node;
816 for (i = 0; a->v.zonelist->zones[i]; i++)
817 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
819 return b->v.zonelist->zones[i] == NULL;
827 /* Slow path of a mpol destructor. */
828 void __mpol_free(struct mempolicy *p)
830 if (!atomic_dec_and_test(&p->refcnt))
832 if (p->policy == MPOL_BIND)
833 kfree(p->v.zonelist);
834 p->policy = MPOL_DEFAULT;
835 kmem_cache_free(policy_cache, p);
839 * Hugetlb policy. Same as above, just works with node numbers instead of
843 /* Find first node suitable for an allocation */
844 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
846 struct mempolicy *pol = get_vma_policy(vma, addr);
848 switch (pol->policy) {
850 return numa_node_id();
852 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
853 case MPOL_INTERLEAVE:
854 return interleave_nodes(pol);
856 return pol->v.preferred_node >= 0 ?
857 pol->v.preferred_node : numa_node_id();
863 /* Find secondary valid nodes for an allocation */
864 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
866 struct mempolicy *pol = get_vma_policy(vma, addr);
868 switch (pol->policy) {
871 case MPOL_INTERLEAVE:
875 for (z = pol->v.zonelist->zones; *z; z++)
876 if ((*z)->zone_pgdat->node_id == nid)
887 * Shared memory backing store policy support.
889 * Remember policies even when nobody has shared memory mapped.
890 * The policies are kept in Red-Black tree linked from the inode.
891 * They are protected by the sp->lock spinlock, which should be held
892 * for any accesses to the tree.
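 *
 * Illustrative example: a tmpfs file whose first 16 pages were mbind()ed
 * to MPOL_INTERLEAVE and the next 16 to MPOL_BIND ends up with two
 * sp_nodes in the tree, keyed by the page offset ranges [0,16) and [16,32).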
895 /* lookup first element intersecting start-end */
896 /* Caller holds sp->lock */
897 static struct sp_node *
898 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
900 struct rb_node *n = sp->root.rb_node;
903 struct sp_node *p = rb_entry(n, struct sp_node, nd);
907 else if (end <= p->start)
915 struct sp_node *w = NULL;
916 struct rb_node *prev = rb_prev(n);
919 w = rb_entry(prev, struct sp_node, nd);
924 return rb_entry(n, struct sp_node, nd);
927 /* Insert a new shared policy into the list. */
928 /* Caller holds sp->lock */
929 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
931 struct rb_node **p = &sp->root.rb_node;
932 struct rb_node *parent = NULL;
937 nd = rb_entry(parent, struct sp_node, nd);
938 if (new->start < nd->start)
940 else if (new->end > nd->end)
945 rb_link_node(&new->nd, parent, p);
946 rb_insert_color(&new->nd, &sp->root);
947 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
948 new->policy ? new->policy->policy : 0);
951 /* Find shared policy intersecting idx */
953 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
955 struct mempolicy *pol = NULL;
958 if (!sp->root.rb_node)
960 spin_lock(&sp->lock);
961 sn = sp_lookup(sp, idx, idx+1);
963 mpol_get(sn->policy);
966 spin_unlock(&sp->lock);
970 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
972 PDprintk("deleting %lx-l%x\n", n->start, n->end);
973 rb_erase(&n->nd, &sp->root);
974 mpol_free(n->policy);
975 kmem_cache_free(sn_cache, n);
979 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
981 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
992 /* Replace a policy range. */
993 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
994 unsigned long end, struct sp_node *new)
996 struct sp_node *n, *new2 = NULL;
999 spin_lock(&sp->lock);
1000 n = sp_lookup(sp, start, end);
1001 /* Take care of old policies in the same range. */
1002 while (n && n->start < end) {
1003 struct rb_node *next = rb_next(&n->nd);
1004 if (n->start >= start) {
1010 /* Old policy spanning whole new range. */
1013 spin_unlock(&sp->lock);
1014 new2 = sp_alloc(end, n->end, n->policy);
1020 sp_insert(sp, new2);
1023 /* Old crossing beginning, but not end (easy) */
1024 if (n->start < start && n->end > start)
1029 n = rb_entry(next, struct sp_node, nd);
1033 spin_unlock(&sp->lock);
1035 mpol_free(new2->policy);
1036 kmem_cache_free(sn_cache, new2);
1041 int mpol_set_shared_policy(struct shared_policy *info,
1042 struct vm_area_struct *vma, struct mempolicy *npol)
1045 struct sp_node *new = NULL;
1046 unsigned long sz = vma_pages(vma);
1048 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1050 sz, npol? npol->policy : -1,
1051 npol ? npol->v.nodes[0] : -1);
1054 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1058 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1060 kmem_cache_free(sn_cache, new);
1064 /* Free a backing policy store on inode delete. */
1065 void mpol_free_shared_policy(struct shared_policy *p)
1068 struct rb_node *next;
1070 if (!p->root.rb_node)
1072 spin_lock(&p->lock);
1073 next = rb_first(&p->root);
1075 n = rb_entry(next, struct sp_node, nd);
1076 next = rb_next(&n->nd);
1077 rb_erase(&n->nd, &p->root);
1078 mpol_free(n->policy);
1079 kmem_cache_free(sn_cache, n);
1081 spin_unlock(&p->lock);
1084 /* assumes fs == KERNEL_DS */
1085 void __init numa_policy_init(void)
1087 policy_cache = kmem_cache_create("numa_policy",
1088 sizeof(struct mempolicy),
1089 0, SLAB_PANIC, NULL, NULL);
1091 sn_cache = kmem_cache_create("shared_policy_node",
1092 sizeof(struct sp_node),
1093 0, SLAB_PANIC, NULL, NULL);
1095 /* Set interleaving policy for system init. This way not all
1096 the data structures allocated at system boot end up in node zero. */
1098 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1100 printk("numa_policy_init: interleaving failed\n");
1103 /* Reset policy of current process to default.
1104 * Assumes fs == KERNEL_DS */
1105 void numa_default_policy(void)
1107 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);