/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
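
/*
 * Illustrative user space usage (a sketch, not part of this file; it assumes
 * the mbind()/set_mempolicy() wrappers that libnuma provides around the
 * system calls implemented below):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8);
 *		interleaves all further allocations of this process
 *		over nodes 0 and 1
 *
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8,
 *	      MPOL_MF_STRICT);
 *		restricts an existing mapping to node 0 and fails with
 *		-EIO if pages of it already live elsewhere
 */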
/*
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
	DECLARE_BITMAP(offline, MAX_NUMNODES);

	bitmap_copy(offline, node_online_map, MAX_NUMNODES);
	if (bitmap_empty(offline, MAX_NUMNODES))
	bitmap_complement(offline, MAX_NUMNODES);
	bitmap_and(offline, offline, nodes, MAX_NUMNODES);
	if (!bitmap_empty(offline, MAX_NUMNODES))
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, unsigned long *nodes)
	int empty = bitmap_empty(nodes, MAX_NUMNODES);

	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
	return nodes_online(nodes);
/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
		     unsigned long maxnode, int mode)
	unsigned long nlongs;
	unsigned long endmask;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
	endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nmask && nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
			if (k == nlongs - 1) {
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);

	bitmap_zero(nodes, MAX_NUMNODES);
	if (nmask && copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
	nodes[nlongs-1] &= endmask;
	return mpol_check_policy(mode, nodes);
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);

	for (nd = find_first_bit(nodes, MAX_NUMNODES);
	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
			struct zone *z = &NODE_DATA(nd)->node_zones[k];
			if (!z->present_pages)
			zl->zones[num++] = z;
	zl->zones[num] = NULL;
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
	if (mode == MPOL_DEFAULT)
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	case MPOL_INTERLEAVE:
		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
	policy->policy = mode;
/* Ensure all existing pages follow the policy. */
verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
	pgd_t *pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
	pmd = pmd_offset(pgd, addr);
	if (pmd_none(*pmd)) {
		addr = (addr + PMD_SIZE) & PMD_MASK;
	pte = pte_offset_map(pmd, addr);
	if (pte_present(*pte))
	unsigned nid = page_to_nid(p);
	if (!test_bit(nid, nodes))
/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
	    unsigned long *nodes, unsigned long flags)
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
		return ERR_PTR(-EFAULT);
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!vma->vm_next && vma->vm_end < end)
			return ERR_PTR(-EFAULT);
		if (prev && prev->vm_end < vma->vm_start)
			return ERR_PTR(-EFAULT);
		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
			err = verify_pages(vma->vm_start, vma->vm_end, nodes);
				first = ERR_PTR(err);
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
		vma->vm_policy = new;
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
	struct vm_area_struct *next;

	for (; vma && vma->vm_start < end; vma = next) {
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
			err = policy_vma(vma, new);
/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			  unsigned long __user *nmask, unsigned long maxnode,
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
	if (start & ~PAGE_MASK)
	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;
	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	err = get_nodes(nodes, nmask, maxnode, mode);
	new = mpol_new(mode, nodes);
	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nodes, flags);
		err = mbind_range(vma, start, end, new);
	up_write(&mm->mmap_sem);
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
				  unsigned long maxnode)
	struct mempolicy *new;
	DECLARE_BITMAP(nodes, MAX_NUMNODES);

	err = get_nodes(nodes, nmask, maxnode, mode);
	new = mpol_new(mode, nodes);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
	bitmap_zero(nodes, MAX_NUMNODES);
		for (i = 0; p->v.zonelist->zones[i]; i++)
			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
	case MPOL_INTERLEAVE:
		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
			__set_bit(p->v.preferred_node, nodes);
static int lookup_node(struct mm_struct *mm, unsigned long addr)
	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
		err = page_zone(p)->zone_pgdat->node_id;
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      void *nodes, unsigned nbytes)
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;

	if (copy > PAGE_SIZE)
	if (clear_user((char __user *)mask + nbytes, copy - nbytes))
	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				  unsigned long __user *nmask,
				  unsigned long maxnode,
				  unsigned long addr, unsigned long flags)
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
	if (nmask != NULL && maxnode < numnodes)
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
			up_read(&mm->mmap_sem);
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
			pol = vma->vm_policy;
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
		} else if (pol == current->mempolicy &&
			   pol->policy == MPOL_INTERLEAVE) {
			pval = current->il_next;

	if (policy && put_user(pval, policy))

		DECLARE_BITMAP(nodes, MAX_NUMNODES);
		get_zonemask(pol, nodes);
		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));

		up_read(&current->mm->mmap_sem);
/* The other functions are compatible */
asmlinkage long compat_get_mempolicy(int __user *policy,
				     unsigned __user *nmask, unsigned maxnode,
				     unsigned addr, unsigned flags)
	unsigned long __user *nm = NULL;

		nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8);
	err = sys_get_mempolicy(policy, nm, maxnode, addr, flags);
	if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8))
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = current->mempolicy;

		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
		pol = &default_policy;
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
	switch (policy->policy) {
		nd = policy->v.preferred_node;
		/* Lower zones don't get a policy applied */
		if (gfp >= policy_zone)
			return policy->v.zonelist;
	case MPOL_INTERLEAVE: /* should not happen */
	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
	struct task_struct *me = current;

	BUG_ON(nid >= MAX_NUMNODES);
	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
	if (next >= MAX_NUMNODES)
		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long off)
	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
	unsigned target = (unsigned)off % nnodes;

		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
	} while (c <= target);
	BUG_ON(nid >= MAX_NUMNODES);
	BUG_ON(!test_bit(nid, pol->v.nodes));
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned gfp, unsigned nid)
	BUG_ON(!test_bit(nid, node_online_map));
	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
	page = __alloc_pages(gfp, 0, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * %GFP_USER     user allocation.
 * %GFP_KERNEL   kernel allocations,
 * %GFP_HIGHMEM  highmem/user allocations,
 * %GFP_FS       allocation should not call back into a file system.
 * %GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into
 * user space. Returns NULL when no page can be allocated.
 *
 * Should be called with the mm_sem of the vma held.
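 *
 * A minimal illustrative caller (an assumption for documentation, not code
 * taken from this file), as it might look in an anonymous fault path that
 * already holds the mmap_sem for read:
 *
 *	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *	clear_user_highpage(page, addr);
 */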
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(vma, addr);

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		BUG_ON(addr >= vma->vm_end);
		BUG_ON(addr < vma->vm_start);
		off += (addr - vma->vm_start) >> PAGE_SHIFT;
		nid = offset_il_node(pol, vma, off);
		/* fall back to process interleaving */
		nid = interleave_nodes(pol);
		return alloc_page_interleave(gfp, nid);
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
/**
 * alloc_pages_current - Allocate pages.
 *
 * %GFP_USER     user allocation,
 * %GFP_KERNEL   kernel allocation,
 * %GFP_HIGHMEM  highmem allocation,
 * %GFP_FS       don't call back into a file system.
 * %GFP_ATOMIC   don't sleep.
 *
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
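 *
 * An illustrative call (an assumed caller, shown only to make the interface
 * concrete; on NUMA kernels the generic alloc_pages() is expected to end up
 * here for allocations done in process context):
 *
 *	struct page *p = alloc_pages_current(GFP_KERNEL, 0);
 *	if (p)
 *		__free_pages(p, 0);
 */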
struct page *alloc_pages_current(unsigned gfp, unsigned order)
	struct mempolicy *pol = current->mempolicy;

	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE && order == 0)
		return alloc_page_interleave(gfp, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

		return ERR_PTR(-ENOMEM);
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		memcpy(new->v.zonelist, old->v.zonelist, sz);
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
	if (a->policy != b->policy)
	case MPOL_INTERLEAVE:
		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
		return a->v.preferred_node == b->v.preferred_node;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
		return b->v.zonelist->zones[i] == NULL;
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
	if (!atomic_dec_and_test(&p->refcnt))
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */
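
/*
 * A sketch of the intended use (an assumption made for illustration; the
 * real hugetlb caller lives outside this file). It starts scanning the per
 * node huge page pools at mpol_first_node() and skips nodes rejected by
 * mpol_node_valid(); dequeue_huge_page_node() stands in for whatever helper
 * the hugetlb side uses to take a page from a node's pool:
 *
 *	int i, nid = mpol_first_node(vma, addr);
 *	for (i = 0; i < MAX_NUMNODES; i++, nid = (nid + 1) % MAX_NUMNODES) {
 *		if (!mpol_node_valid(nid, vma, addr))
 *			continue;
 *		page = dequeue_huge_page_node(nid);
 *		if (page)
 *			break;
 *	}
 */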
/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
		return numa_node_id();
		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
	case MPOL_INTERLEAVE:
		return interleave_nodes(pol);
		return pol->v.preferred_node >= 0 ?
			pol->v.preferred_node : numa_node_id();
/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
	struct mempolicy *pol = get_vma_policy(vma, addr);

	switch (pol->policy) {
	case MPOL_INTERLEAVE:
		for (z = pol->v.zonelist->zones; *z; z++)
			if ((*z)->zone_pgdat->node_id == nid)
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->sem semaphore, which should be held
 * for any accesses to the tree.
 */
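
/*
 * Illustrative hook-up (an assumed sketch for documentation; tmpfs is the
 * in-tree user and its callbacks live in mm/shmem.c, not here): the
 * vm_operations_struct set_policy/get_policy callbacks simply forward to
 * the shared policy helpers below, keyed by the file offset of the mapping:
 *
 *	static int shmem_set_policy(struct vm_area_struct *vma,
 *				    struct mempolicy *new)
 *	{
 *		struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *		return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, new);
 *	}
 *
 *	static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 *						  unsigned long addr)
 *	{
 *		struct inode *inode = vma->vm_file->f_dentry->d_inode;
 *		unsigned long idx = ((addr - vma->vm_start) >> PAGE_SHIFT)
 *					+ vma->vm_pgoff;
 *		return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
 *	}
 */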
/* lookup first element intersecting start-end */
/* Caller holds sp->sem */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
	struct rb_node *n = sp->root.rb_node;

		struct sp_node *p = rb_entry(n, struct sp_node, nd);
		if (start >= p->end) {
		} else if (end < p->start) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
			w = rb_entry(prev, struct sp_node, nd);
	return rb_entry(n, struct sp_node, nd);
/* Insert a new shared policy into the list. */
/* Caller holds sp->sem */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;

		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
		else if (new->end > nd->end)
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
/* Find shared policy intersecting idx */
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
	struct mempolicy *pol = NULL;

	sn = sp_lookup(sp, idx, idx+1);
		mpol_get(sn->policy);
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
	struct sp_node *n, *new2;

	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* Old policy spanning whole new range. */
			new2 = sp_alloc(end, n->end, n->policy);
		/* Old crossing beginning, but not end (easy) */
		if (n->start < start && n->end > start)
		n = rb_entry(next, struct sp_node, nd);
int mpol_set_shared_policy(struct shared_policy *info,
			   struct vm_area_struct *vma, struct mempolicy *npol)
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 sz, npol ? npol->policy : -1,
		 npol ? npol->v.nodes[0] : -1);

		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
		kmem_cache_free(sn_cache, new);
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
	struct rb_node *next;

	next = rb_first(&p->root);
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
static __init int numa_policy_init(void)
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);
	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

module_init(numa_policy_init);