/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/

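/*
 * Usage sketch for the policies above (userspace, illustrative only,
 * not compiled here): interleave all future allocations of the calling
 * process over nodes 0 and 1. Assumes glibc's syscall(2), this
 * architecture's __NR_set_mempolicy number, and MPOL_INTERLEAVE as
 * defined in <linux/mempolicy.h>. Error handling elided.
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);
 *	// maxnode counts the significant mask bits plus one;
 *	// get_nodes() below decrements it before building the mask.
 *	syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, &nodes, 3);
 */
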
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <asm/uaccess.h>

static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
static int policy_zone;

static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
        DECLARE_BITMAP(online2, MAX_NUMNODES);

        bitmap_copy(online2, node_online_map, MAX_NUMNODES);
        if (bitmap_empty(online2, MAX_NUMNODES))
                set_bit(0, online2);
        if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
                return -EINVAL;
        return 0;
}

106 /* Do sanity checking on a policy */
107 static int mpol_check_policy(int mode, unsigned long *nodes)
109 int empty = bitmap_empty(nodes, MAX_NUMNODES);
117 case MPOL_INTERLEAVE:
118 /* Preferred will only use the first bit, but allow
124 return nodes_online(nodes);
/* Copy a node mask from user space. */
static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
                     unsigned long maxnode, int mode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        bitmap_zero(nodes, MAX_NUMNODES);
        if (maxnode == 0 || !nmask)
                return 0;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes[nlongs-1] &= endmask;
        return mpol_check_policy(mode, nodes);
}

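/* Worked example (assuming BITS_PER_LONG == 64): a caller passing
   maxnode == 3 means bits 0 and 1 are significant. After --maxnode,
   nlongs = BITS_TO_LONGS(2) = 1 and endmask = (1UL << 2) - 1 = 0x3,
   so any stray high bits in the copied word are cleared before
   mpol_check_policy() runs. */
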
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(unsigned long *nodes)
{
        struct zonelist *zl;
        int num, max, nd, k;

        max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
        zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        for (nd = find_first_bit(nodes, MAX_NUMNODES);
             nd < MAX_NUMNODES;
             nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
                /* Highest zones first; skip zones without memory. */
                for (k = MAX_NR_ZONES-1; k >= 0; k--) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (!z->present_pages)
                                continue;
                        zl->zones[num++] = z;
                        if (k > policy_zone)
                                policy_zone = k;
                }
        }
        BUG_ON(num >= max);
        zl->zones[num] = NULL;
        return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        return policy;
}

/* Ensure all existing pages follow the policy. */
static int
verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
{
        while (addr < end) {
                struct page *p;
                pte_t *pte;
                pmd_t *pmd;
                pgd_t *pgd = pgd_offset_k(addr);
                if (pgd_none(*pgd)) {
                        addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
                        continue;
                }
                pmd = pmd_offset(pgd, addr);
                if (pmd_none(*pmd)) {
                        addr = (addr + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                p = NULL;
                pte = pte_offset_map(pmd, addr);
                if (pte_present(*pte))
                        p = pte_page(*pte);
                pte_unmap(pte);
                if (p) {
                        unsigned nid = page_to_nid(p);
                        if (!test_bit(nid, nodes))
                                return -EIO;
                }
                addr += PAGE_SIZE;
        }
        return 0;
}

/* Step 1: check the range */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
            unsigned long *nodes, unsigned long flags)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!vma->vm_next && vma->vm_end < end)
                        return ERR_PTR(-EFAULT);
                if (prev && prev->vm_end < vma->vm_start)
                        return ERR_PTR(-EFAULT);
                if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
                        err = verify_pages(vma->vm_start, vma->vm_end, nodes);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}

/* Change policy for a memory range */
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                          unsigned long mode,
                          unsigned long __user *nmask, unsigned long maxnode,
                          unsigned flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        DECLARE_BITMAP(nodes, MAX_NUMNODES);
        int err;

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
                return -EINVAL;
        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;
        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        err = get_nodes(nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
                 mode,nodes[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nodes, flags);
        err = PTR_ERR(vma);
        if (!IS_ERR(vma))
                err = mbind_range(vma, start, end, new);
        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}

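/* Usage sketch (userspace, illustrative only): restrict an anonymous
   mapping to node 0, failing with -EIO if pages already live elsewhere.
   Assumes this architecture's __NR_mbind number and the MPOL_* and
   MPOL_MF_* constants from <linux/mempolicy.h>; len is page-aligned and
   error handling is elided.

        void *buf = mmap(NULL, len, PROT_READ|PROT_WRITE,
                         MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        unsigned long node0 = 1UL << 0;
        syscall(__NR_mbind, (unsigned long)buf, len, MPOL_BIND,
                &node0, 2, MPOL_MF_STRICT);
 */
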
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                                  unsigned long maxnode)
{
        int err;
        struct mempolicy *new;
        DECLARE_BITMAP(nodes, MAX_NUMNODES);

        if (mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(nodes, nmask, maxnode, mode);
        if (err)
                return err;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
        return 0;
}

/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
{
        int i;

        bitmap_zero(nodes, MAX_NUMNODES);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
                else
                        __set_bit(p->v.preferred_node, nodes);
                break;
        default:
                BUG();
        }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_zone(p)->zone_pgdat->node_id;
                put_page(p);
        }
        return err;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              void *nodes, unsigned nbytes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                  unsigned long __user *nmask,
                                  unsigned long maxnode,
                                  unsigned long addr, unsigned long flags)
{
        int err, pval;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (nmask != NULL && maxnode < numnodes)
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        pval = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        pval = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                pval = pol->policy;

        err = -EFAULT;
        if (policy && put_user(pval, policy))
                goto out;

        err = 0;
        if (nmask) {
                DECLARE_BITMAP(nodes, MAX_NUMNODES);
                get_zonemask(pol, nodes);
                err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
        }

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}

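/* Usage sketch (userspace, illustrative only): ask which node backs a
   mapped address. MPOL_F_NODE|MPOL_F_ADDR routes through lookup_node(),
   which faults the page in via get_user_pages() and reports its node id
   in *policy. Assumes this architecture's __NR_get_mempolicy number.

        int node;
        syscall(__NR_get_mempolicy, &node, NULL, 0,
                (unsigned long)addr, MPOL_F_NODE|MPOL_F_ADDR);
 */
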
#ifdef CONFIG_COMPAT
/* The other functions are compatible */
asmlinkage long compat_get_mempolicy(int __user *policy,
                                     unsigned __user *nmask, unsigned maxnode,
                                     unsigned addr, unsigned flags)
{
        long err;
        unsigned long __user *nm = NULL;

        if (nmask)
                nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8);
        err = sys_get_mempolicy(policy, nm, maxnode, addr, flags);
        if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8))
                err = -EFAULT;
        return err;
}
#endif

/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = current->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                         vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                if ((gfp & GFP_ZONEMASK) >= policy_zone)
                        return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        BUG_ON(nid >= MAX_NUMNODES);
        next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
        if (next >= MAX_NUMNODES)
                next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
        me->il_next = next;
        return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                               struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
                c++;
        } while (c <= target);
        BUG_ON(nid >= MAX_NUMNODES);
        BUG_ON(!test_bit(nid, pol->v.nodes));
        return nid;
}

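/* Worked example: with bits {0, 2, 3} set in pol->v.nodes, nnodes = 3.
   For off = 7, target = 7 % 3 = 1, so the loop walks past bit 0 and
   stops at the second set bit, returning node 2. The same offset always
   yields the same node, which keeps VMA interleaving stable across
   repeated faults of the same page. */
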
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        BUG_ON(!test_bit(nid, node_online_map));
        zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zl->zones[0]->pageset[get_cpu()].interleave_hit++;
                put_cpu();
        }
        return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away. Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 */
struct page *
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
                if (vma) {
                        unsigned long off;
                        BUG_ON(addr >= vma->vm_end);
                        BUG_ON(addr < vma->vm_start);
                        off = vma->vm_pgoff;
                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
                        nid = offset_il_node(pol, vma, off);
                } else {
                        /* fall back to process interleaving */
                        nid = interleave_nodes(pol);
                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

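/* Call-site sketch (kernel, illustrative only), as a fault handler would
   allocate an anonymous page; GFP_HIGHUSER is the usual choice for
   pages that end up mapped into user space:

        struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
        if (!page)
                return VM_FAULT_OOM;
 */
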
/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(unsigned gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

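/* Note: on CONFIG_NUMA kernels the generic alloc_pages()/alloc_page()
   wrappers in <linux/gfp.h> expand to alloc_pages_current(), so an
   ordinary

        struct page *p = alloc_page(GFP_KERNEL);

   already obeys the process policy installed by sys_set_mempolicy(). */
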
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}

/*
 * Hugetlb policy. Same as above, just works with node numbers instead of
 * zonelists.
 */

/* Find first node suitable for an allocation */
int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        switch (pol->policy) {
        case MPOL_DEFAULT:
                return numa_node_id();
        case MPOL_BIND:
                return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);
        case MPOL_PREFERRED:
                return pol->v.preferred_node >= 0 ?
                        pol->v.preferred_node : numa_node_id();
        }
        BUG();
        return 0;
}

/* Find secondary valid nodes for an allocation */
int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(vma, addr);

        switch (pol->policy) {
        case MPOL_PREFERRED:
        case MPOL_DEFAULT:
        case MPOL_INTERLEAVE:
                return 1;
        case MPOL_BIND: {
                struct zone **z;
                for (z = pol->v.zonelist->zones; *z; z++)
                        if ((*z)->zone_pgdat->node_id == nid)
                                return 1;
                return 0;
        }
        default:
                BUG();
                return 0;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->sem semaphore, which should be held
 * for any accesses to the tree.
 */

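/* Illustrative layout: if a tmpfs file has an interleave policy on page
   offsets [0, 4) and a bind policy on [4, 8), the tree holds two
   sp_nodes keyed by those ranges:

        [0, 4) -> MPOL_INTERLEAVE        [4, 8) -> MPOL_BIND

   mpol_shared_policy_lookup(sp, 5) then finds the second node and
   returns its policy with an extra reference taken. */
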
/* lookup first element intersecting start-end */
/* Caller holds sp->sem */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);
                if (start >= p->end) {
                        n = n->rb_right;
                } else if (end < p->start) {
                        n = n->rb_left;
                } else {
                        break;
                }
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->sem */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        down(&sp->sem);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        up(&sp->sem);
        return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2;

        down(&sp->sem);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                new2 = sp_alloc(end, n->end, n->policy);
                                if (!new2) {
                                        up(&sp->sem);
                                        return -ENOMEM;
                                }
                                n->end = end;
                                sp_insert(sp, new2);
                        }
                        /* Old crossing beginning, but not end (easy) */
                        if (n->start < start && n->end > start)
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        up(&sp->sem);
        return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
                           struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol? npol->policy : -1,
                 npol ? npol->v.nodes[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        down(&p->sem);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        up(&p->sem);
}

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */
        if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0)
                printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default.
 * Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}