+ if (!pol)
+ pol = &default_policy;
+
+ if (flags & MPOL_F_NODE) {
+ if (flags & MPOL_F_ADDR) {
+ err = lookup_node(mm, addr);
+ if (err < 0)
+ goto out;
+ *policy = err;
+ } else if (pol == current->mempolicy &&
+ pol->policy == MPOL_INTERLEAVE) {
+ *policy = current->il_next;
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+ } else
+ *policy = pol->policy;
+
+ if (vma) {
+ up_read(¤t->mm->mmap_sem);
+ vma = NULL;
+ }
+
+ err = 0;
+ if (nmask)
+ get_zonemask(pol, nmask);
+
+ out:
+ if (vma)
+ up_read(¤t->mm->mmap_sem);
+ return err;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * page migration
+ */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+ isolate_lru_page(page, pagelist);
+}
+
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+{
+ return alloc_pages_node(node, GFP_HIGHUSER, 0);
+}
+
+/*
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+ nodemask_t nmask;
+ LIST_HEAD(pagelist);
+ int err = 0;
+
+ nodes_clear(nmask);
+ node_set(source, nmask);
+
+ check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist))
+ err = migrate_pages(&pagelist, new_node_page, dest);
+
+ return err;
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
+ *
+ * Returns the number of page that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ LIST_HEAD(pagelist);
+ int busy = 0;
+ int err = 0;
+ nodemask_t tmp;
+
+ down_read(&mm->mmap_sem);
+
+ err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ if (err)
+ goto out;
+
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scannng from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
+
+ tmp = *from_nodes;
+ while (!nodes_empty(tmp)) {
+ int s,d;
+ int source = -1;
+ int dest = 0;
+
+ for_each_node_mask(s, tmp) {
+ d = node_remap(s, *from_nodes, *to_nodes);
+ if (s == d)
+ continue;
+
+ source = s; /* Node moved. Memorize */
+ dest = d;
+
+ /* dest not in remaining from nodes? */
+ if (!node_isset(dest, tmp))
+ break;
+ }
+ if (source == -1)
+ break;
+
+ node_clear(source, tmp);
+ err = migrate_to_node(mm, source, dest, flags);
+ if (err > 0)
+ busy += err;
+ if (err < 0)
+ break;
+ }
+out:
+ up_read(&mm->mmap_sem);
+ if (err < 0)
+ return err;
+ return busy;
+
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+ return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
+}
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ return NULL;
+}
+#endif
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *new;
+ unsigned long end;
+ int err;
+ LIST_HEAD(pagelist);
+
+ if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ || mode > MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (mode == MPOL_DEFAULT)
+ flags &= ~MPOL_MF_STRICT;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+
+ new = mpol_new(mode, nmask);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ /*
+ * If we are using the default policy then operation
+ * on discontinuous address spaces is okay after all
+ */
+ if (!new)
+ flags |= MPOL_MF_DISCONTIG_OK;
+
+ PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode,nodes_addr(nodes)[0]);
+
+ down_write(&mm->mmap_sem);
+ vma = check_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+ err = PTR_ERR(vma);
+ if (!IS_ERR(vma)) {
+ int nr_failed = 0;
+
+ err = mbind_range(vma, start, end, new);
+
+ if (!list_empty(&pagelist))
+ nr_failed = migrate_pages(&pagelist, new_vma_page,
+ (unsigned long)vma);
+
+ if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ }
+
+ up_write(&mm->mmap_sem);
+ mpol_free(new);
+ return err;
+}
+
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ unsigned long k;
+ unsigned long nlongs;
+ unsigned long endmask;
+
+ --maxnode;
+ nodes_clear(*nodes);
+ if (maxnode == 0 || !nmask)
+ return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
+
+ nlongs = BITS_TO_LONGS(maxnode);
+ if ((maxnode % BITS_PER_LONG) == 0)
+ endmask = ~0UL;
+ else
+ endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+ /* When the user specified more nodes than supported just check
+ if the non supported part is all zero. */
+ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+ if (nlongs > PAGE_SIZE/sizeof(long))
+ return -EINVAL;
+ for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+ unsigned long t;
+ if (get_user(t, nmask + k))
+ return -EFAULT;
+ if (k == nlongs - 1) {
+ if (t & endmask)
+ return -EINVAL;
+ } else if (t)
+ return -EINVAL;
+ }
+ nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+ endmask = ~0UL;
+ }
+
+ if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+ return -EFAULT;
+ nodes_addr(*nodes)[nlongs-1] &= endmask;
+ return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+ nodemask_t *nodes)
+{
+ unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+ const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+ if (copy > nbytes) {
+ if (copy > PAGE_SIZE)
+ return -EINVAL;
+ if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+ return -EFAULT;
+ copy = nbytes;
+ }
+ return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+ unsigned long mode,
+ unsigned long __user *nmask, unsigned long maxnode,
+ unsigned flags)
+{
+ nodemask_t nodes;
+ int err;
+
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+#ifdef CONFIG_CPUSETS
+ /* Restrict the nodes to the allowed nodes in the cpuset */
+ nodes_and(nodes, nodes, current->mems_allowed);
+#endif
+ return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+ unsigned long maxnode)
+{
+ int err;
+ nodemask_t nodes;
+
+ if (mode < 0 || mode > MPOL_MAX)
+ return -EINVAL;
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return err;
+ return do_set_mempolicy(mode, &nodes);
+}
+
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+ const unsigned long __user *old_nodes,
+ const unsigned long __user *new_nodes)
+{
+ struct mm_struct *mm;
+ struct task_struct *task;
+ nodemask_t old;
+ nodemask_t new;
+ nodemask_t task_nodes;
+ int err;
+
+ err = get_nodes(&old, old_nodes, maxnode);
+ if (err)
+ return err;
+
+ err = get_nodes(&new, new_nodes, maxnode);
+ if (err)
+ return err;
+
+ /* Find the mm_struct */
+ read_lock(&tasklist_lock);
+ task = pid ? find_task_by_pid(pid) : current;
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ mm = get_task_mm(task);
+ read_unlock(&tasklist_lock);
+
+ if (!mm)
+ return -EINVAL;
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. The right exists if the process has administrative
+ * capabilities, superuser privileges or the same
+ * userid as the target process.
+ */
+ if ((current->euid != task->suid) && (current->euid != task->uid) &&
+ (current->uid != task->suid) && (current->uid != task->uid) &&
+ !capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ task_nodes = cpuset_mems_allowed(task);
+ /* Is the user allowed to access the target nodes? */
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ err = security_task_movememory(task);
+ if (err)
+ goto out;
+
+ err = do_migrate_pages(mm, &old, &new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+out:
+ mmput(mm);
+ return err;
+}
+
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+ unsigned long __user *nmask,
+ unsigned long maxnode,
+ unsigned long addr, unsigned long flags)
+{
+ int err, pval;
+ nodemask_t nodes;
+
+ if (nmask != NULL && maxnode < MAX_NUMNODES)
+ return -EINVAL;
+
+ err = do_get_mempolicy(&pval, &nodes, addr, flags);