* process policy.
* default	Allocate on the local node first, or when on a VMA
*		use the process policy. This is what Linux always did
*		in a NUMA aware kernel and still does by, ahem, default.
*
* The process policy is applied for most non interrupt memory allocations
* in that process' context. Interrupts ignore the policies and always
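As background for the policy semantics described above, here is a minimal userspace sketch (not part of the patch) that installs an interleave policy through the set_mempolicy(2) wrapper in libnuma's <numaif.h>; the two-node mask is an arbitrary example. Build with -lnuma.

/* Example only, not part of the patch: interleave this process's
 * future allocations across nodes 0 and 1. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long mask = (1UL << 0) | (1UL << 1);	/* nodes 0 and 1 */

	/* maxnode tells the kernel how many mask bits to read */
	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)) < 0) {
		perror("set_mempolicy");
		return EXIT_FAILURE;
	}

	/* pages faulted in from now on alternate between the two nodes */
	void *p = malloc(4 << 20);
	free(p);
	return 0;
}

A VMA policy set with mbind(2) would override this per mapping, exactly as the comment above describes.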
#include <linux/mempolicy.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/nodemask.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
static kmem_cache_t *policy_cache;
/* Check if all specified nodes are online */
static int nodes_online(unsigned long *nodes)
{
- DECLARE_BITMAP(offline, MAX_NUMNODES);
+ DECLARE_BITMAP(online2, MAX_NUMNODES);
- bitmap_copy(offline, node_online_map, MAX_NUMNODES);
- if (bitmap_empty(offline, MAX_NUMNODES))
- set_bit(0, offline);
- bitmap_complement(offline, MAX_NUMNODES);
- bitmap_and(offline, offline, nodes, MAX_NUMNODES);
- if (!bitmap_empty(offline, MAX_NUMNODES))
+ bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
+ if (bitmap_empty(online2, MAX_NUMNODES))
+ set_bit(0, online2);
+ if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
return -EINVAL;
return 0;
}
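In other words, the requested mask must be a subset of node_online_map, with node 0 treated as online while the map is still empty early in boot. A standalone illustration of the same rule, under the assumption that one unsigned long is enough to hold the mask:

/* Illustration only: the nodes_online() rule on a one-word bitmap. */
#include <assert.h>

int main(void)
{
	unsigned long online = 0;		/* early boot: map still empty */
	unsigned long requested = 1UL << 0;	/* caller asks for node 0 */

	if (online == 0)
		online = 1UL << 0;		/* node 0 is assumed online */

	/* the bitmap_subset() test: no requested bit outside 'online' */
	assert((requested & ~online) == 0);
	return 0;
}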
unsigned long endmask;
--maxnode;
+ bitmap_zero(nodes, MAX_NUMNODES);
+ if (maxnode == 0 || !nmask)
+ return 0;
+
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
/* When the user specified more nodes than supported just check
if the non supported part is all zero. */
- if (nmask && nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > PAGE_SIZE/sizeof(long))
+ return -EINVAL;
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs-1) {
				if ((t & endmask) == 0)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
}
- bitmap_zero(nodes, MAX_NUMNODES);
- if (nmask && copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
+ if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes[nlongs-1] &= endmask;
return mpol_check_policy(mode, nodes);
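To make the endmask arithmetic concrete, here is a small standalone sketch (assuming 64-bit unsigned long) of how get_nodes() trims bits beyond the caller's maxnode:

/* Illustration only: trimming the last word of the user nodemask. */
#include <stdio.h>

int main(void)
{
	unsigned long bits_per_long = 8 * sizeof(unsigned long);
	unsigned long maxnode = 11 - 1;		/* user passed maxnode = 11 */
	unsigned long endmask;

	if ((maxnode % bits_per_long) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % bits_per_long)) - 1;

	printf("endmask = %#lx\n", endmask);	/* 0x3ff: bits 0..9 survive */
	return 0;
}

Only bits 0..maxnode-1 of the final word survive; any set bit above that range makes the call fail with -EINVAL.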
/* Ensure all existing pages follow the policy. */
static int
-verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes)
+verify_pages(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
{
while (addr < end) {
struct page *p;
pte_t *pte;
pmd_t *pmd;
- pgd_t *pgd = pgd_offset_k(addr);
+ pud_t *pud;
+ pgd_t *pgd;
+ pgd = pgd_offset(mm, addr);
if (pgd_none(*pgd)) {
- addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
+ unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
+		if (next <= addr)	/* wrapped past the end of the address space */
+			break;
+ addr = next;
+ continue;
+ }
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ addr = (addr + PUD_SIZE) & PUD_MASK;
continue;
}
- pmd = pmd_offset(pgd, addr);
+ pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
addr = (addr + PMD_SIZE) & PMD_MASK;
continue;
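The wrap check in the pgd_none() case above guards against the boundary computation overflowing past the top of the address space. The generic page-table walkers in later kernels solve the same problem with pgd_addr_end() from asm-generic, which for reference reads:

/* For reference: the overflow-safe boundary computation used by the
 * generic walkers. '__boundary - 1 < (end) - 1' holds even when
 * __boundary has wrapped to zero, so the result never passes 'end'. */
#define pgd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
	(__boundary - 1 < (end) - 1) ? __boundary : (end);		\
})

Clamping against end makes overflow impossible, which is why the later walkers pass (addr, end) pairs instead of open-coding the mask as done here.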
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
- err = verify_pages(vma->vm_start, vma->vm_end, nodes);
+ err = verify_pages(vma->vm_mm,
+ vma->vm_start, vma->vm_end, nodes);
if (err) {
first = ERR_PTR(err);
break;
case MPOL_PREFERRED:
/* or use current node instead of online map? */
if (p->v.preferred_node < 0)
- bitmap_copy(nodes, node_online_map, MAX_NUMNODES);
+ bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
else
__set_bit(p->v.preferred_node, nodes);
break;
err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
if (err >= 0) {
- err = page_zone(p)->zone_pgdat->node_id;
+ err = page_to_nid(p);
put_page(p);
}
return err;
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
- if (nmask != NULL && maxnode < numnodes)
+ if (nmask != NULL && maxnode < MAX_NUMNODES)
return -EINVAL;
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
} else
pval = pol->policy;
- err = -EFAULT;
+ if (vma) {
+		up_read(&current->mm->mmap_sem);
+ vma = NULL;
+ }
+
if (policy && put_user(pval, policy))
- goto out;
+ return -EFAULT;
err = 0;
if (nmask) {
}
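For context, the MPOL_F_NODE|MPOL_F_ADDR path above is how userspace asks which node backs a given page. A minimal sketch (not part of the patch) using the get_mempolicy(2) wrapper from <numaif.h>, built with -lnuma:

/* Example only: report the node backing the page at 'addr'. */
#include <numaif.h>
#include <stdio.h>

static int node_of(void *addr)
{
	int node = -1;

	/* with MPOL_F_NODE|MPOL_F_ADDR the policy out-argument is
	 * reused to return the node id of the page backing addr */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) < 0) {
		perror("get_mempolicy");
		return -1;
	}
	return node;
}

int main(void)
{
	int x = 42;

	printf("&x lives on node %d\n", node_of(&x));
	return 0;
}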
#ifdef CONFIG_COMPAT
-/* The other functions are compatible */
-asmlinkage long compat_get_mempolicy(int __user *policy,
- unsigned __user *nmask, unsigned maxnode,
- unsigned addr, unsigned flags)
+
+asmlinkage long compat_sys_get_mempolicy(int __user *policy,
+ compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode,
+ compat_ulong_t addr, compat_ulong_t flags)
{
long err;
unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
if (nmask)
- nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8);
- err = sys_get_mempolicy(policy, nm, maxnode, addr, flags);
- if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8))
- err = -EFAULT;
+ nm = compat_alloc_user_space(alloc_size);
+
+ err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
+
+ if (!err && nmask) {
+ err = copy_from_user(bm, nm, alloc_size);
+ /* ensure entire bitmap is zeroed */
+ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
+ err |= compat_put_bitmap(nmask, bm, nr_bits);
+ }
+
return err;
}
+
+asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode)
+{
+ long err = 0;
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask) {
+ err = compat_get_bitmap(bm, nmask, nr_bits);
+ nm = compat_alloc_user_space(alloc_size);
+ err |= copy_to_user(nm, bm, alloc_size);
+ }
+
+ if (err)
+ return -EFAULT;
+
+ return sys_set_mempolicy(mode, nm, nr_bits+1);
+}
+
+asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
+ compat_ulong_t mode, compat_ulong_t __user *nmask,
+ compat_ulong_t maxnode, compat_ulong_t flags)
+{
+ long err = 0;
+ unsigned long __user *nm = NULL;
+ unsigned long nr_bits, alloc_size;
+ DECLARE_BITMAP(bm, MAX_NUMNODES);
+
+ nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
+
+ if (nmask) {
+ err = compat_get_bitmap(bm, nmask, nr_bits);
+ nm = compat_alloc_user_space(alloc_size);
+ err |= copy_to_user(nm, bm, alloc_size);
+ }
+
+ if (err)
+ return -EFAULT;
+
+ return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
+}
+
#endif
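The bounce buffers in the three compat entry points exist because a 32-bit task hands in its nodemask as an array of 32-bit words, which compat_get_bitmap()/compat_put_bitmap() repack into native unsigned longs. A standalone sketch of that repacking, assuming a little-endian 64-bit kernel (the real helpers additionally perform the user-access checks):

/* Illustration only: pack 32-bit mask words into 64-bit longs. */
#include <stdio.h>

static void pack_compat_bitmap(unsigned long *dst,
			       const unsigned int *src, unsigned nbits)
{
	unsigned i, nwords = (nbits + 31) / 32;

	for (i = 0; i < nwords; i++) {
		if ((i & 1) == 0)
			dst[i / 2] = src[i];		/* low half */
		else
			dst[i / 2] |= (unsigned long)src[i] << 32;
	}
}

int main(void)
{
	unsigned int compat_mask[2] = { 0x5, 0x1 };	/* nodes 0, 2 and 32 */
	unsigned long native_mask[1] = { 0 };

	pack_compat_bitmap(native_mask, compat_mask, 64);
	printf("%#lx\n", native_mask[0]);		/* 0x100000005 */
	return 0;
}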
/* Return effective policy for a VMA */
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(unsigned gfp, unsigned nid)
+static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
{
struct zonelist *zl;
struct page *page;
- BUG_ON(!test_bit(nid, node_online_map));
+ BUG_ON(!node_online(nid));
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
- page = __alloc_pages(gfp, 0, zl);
+ page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
zl->zones[0]->pageset[get_cpu()].interleave_hit++;
put_cpu();
/* fall back to process interleaving */
nid = interleave_nodes(pol);
}
- return alloc_page_interleave(gfp, nid);
+ return alloc_page_interleave(gfp, 0, nid);
}
return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
* alloc_pages_current - Allocate pages.
*
* @gfp:
*	%GFP_USER	user allocation,
*	%GFP_KERNEL	kernel allocation,
*	%GFP_HIGHMEM	highmem allocation,
*	%GFP_FS		don't call back into a file system.
if (!pol || in_interrupt())
pol = &default_policy;
- if (pol->policy == MPOL_INTERLEAVE && order == 0)
- return alloc_page_interleave(gfp, interleave_nodes(pol));
+ if (pol->policy == MPOL_INTERLEAVE)
+ return alloc_page_interleave(gfp, order, interleave_nodes(pol));
return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
- * They are protected by the sp->sem semaphore, which should be held
+ * They are protected by the sp->lock spinlock, which should be held
* for any accesses to the tree.
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->sem */
+/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
- if (start >= p->end) {
+
+ if (start >= p->end)
n = n->rb_right;
- } else if (end < p->start) {
+ else if (end <= p->start)
n = n->rb_left;
- } else {
+ else
break;
- }
}
if (!n)
return NULL;
}
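The change from end < p->start to end <= p->start matters because the stored ranges are half-open, [start, end): ranges that merely touch do not intersect, so the lookup must descend left on equality. A standalone illustration of the predicate:

/* Illustration only: half-open interval overlap as sp_lookup() assumes. */
#include <assert.h>

static int overlaps(unsigned long s1, unsigned long e1,
		    unsigned long s2, unsigned long e2)
{
	return s1 < e2 && s2 < e1;	/* [s1,e1) intersects [s2,e2) */
}

int main(void)
{
	assert(!overlaps(0, 4, 4, 8));	/* touching endpoints: no overlap */
	assert(overlaps(0, 5, 4, 8));	/* genuine intersection */
	return 0;
}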
/* Insert a new shared policy into the list. */
-/* Caller holds sp->sem */
+/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct mempolicy *pol = NULL;
struct sp_node *sn;
- down(&sp->sem);
+ if (!sp->root.rb_node)
+ return NULL;
+ spin_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- up(&sp->sem);
+ spin_unlock(&sp->lock);
return pol;
}
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2;
+ struct sp_node *n, *new2 = NULL;
- down(&sp->sem);
+restart:
+ spin_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
- new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- up(&sp->sem);
- return -ENOMEM;
+ spin_unlock(&sp->lock);
+ new2 = sp_alloc(end, n->end, n->policy);
+ if (!new2)
+ return -ENOMEM;
+ goto restart;
}
- n->end = end;
+ n->end = start;
sp_insert(sp, new2);
- }
- /* Old crossing beginning, but not end (easy) */
- if (n->start < start && n->end > start)
+ new2 = NULL;
+ break;
+ } else
n->end = start;
}
if (!next)
}
if (new)
sp_insert(sp, new);
- up(&sp->sem);
+ spin_unlock(&sp->lock);
+ if (new2) {
+ mpol_free(new2->policy);
+ kmem_cache_free(sn_cache, new2);
+ }
return 0;
}
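The restart label above is an instance of a common pattern: sp_alloc() may sleep, so it cannot run under sp->lock; drop the lock, allocate, retake the lock, and revalidate from scratch, freeing the preallocation if the race made it unnecessary. A schematic of the pattern, where struct foo, need_split(), make_node(), use_node() and free_node() are hypothetical stand-ins rather than kernel interfaces:

/* Schematic only: never call a sleeping allocator under a spinlock,
 * and recheck all state after reacquiring it. */
static int insert_with_prealloc(struct foo *t)
{
	struct foo_node *prealloc = NULL;

restart:
	spin_lock(&t->lock);
	if (need_split(t)) {
		if (!prealloc) {
			spin_unlock(&t->lock);
			prealloc = make_node(GFP_KERNEL);	/* may sleep */
			if (!prealloc)
				return -ENOMEM;
			goto restart;		/* tree may have changed */
		}
		use_node(t, prealloc);
		prealloc = NULL;
	}
	spin_unlock(&t->lock);
	if (prealloc)
		free_node(prealloc);		/* lost the race: unneeded */
	return 0;
}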
struct sp_node *n;
struct rb_node *next;
- down(&p->sem);
+ if (!p->root.rb_node)
+ return;
+ spin_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
mpol_free(n->policy);
kmem_cache_free(sn_cache, n);
}
- up(&p->sem);
+ spin_unlock(&p->lock);
}
-static __init int numa_policy_init(void)
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
{
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL, NULL);
- return 0;
+
+ /* Set interleaving policy for system init. This way not all
+ the data structures allocated at system boot end up in node zero. */
+
+ if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
+ MAX_NUMNODES) < 0)
+ printk("numa_policy_init: interleaving failed\n");
+}
+
+/* Reset policy of current process to default.
+ * Assumes fs == KERNEL_DS */
+void numa_default_policy(void)
+{
+ sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
-module_init(numa_policy_init);