#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
-#include <linux/migrate.h>
-#include <linux/rmap.h>
-#include <linux/security.h>
+#include <linux/vs_cvirt.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
-static struct kmem_cache *policy_cache;
-static struct kmem_cache *sn_cache;
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
+static kmem_cache_t *policy_cache;
+static kmem_cache_t *sn_cache;
#define PDprintk(fmt...)
struct vm_area_struct *first, *vma, *prev;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ /* Must have swap device for migration */
+ if (nr_swap_pages <= 0)
+ return ERR_PTR(-ENODEV);
- err = migrate_prep();
- if (err)
- return ERR_PTR(err);
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
+ lru_add_drain_all();
}
first = find_vma(mm, start);
return mpol_check_policy(mode, nodes);
}
-
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
-}
-
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
- mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
return err;
}
-#ifdef CONFIG_MIGRATION
/*
* page migration
*/
+
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
- isolate_lru_page(page, pagelist);
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (isolate_lru_page(page))
+ list_add_tail(&page->lru, pagelist);
+ }
}
-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * The destination is selected by the arguments: with a non-NULL 'vma' the
+ * new pages are obtained from alloc_page_vma() for that vma; with a NULL
+ * 'vma' they are obtained from alloc_pages_node() on node 'dest'.
+ *
+ * The list is processed in chunks of at most MIGRATE_CHUNK_SIZE pages: one
+ * destination page is allocated per source page, migrate_pages() is called,
+ * and the cycle is repeated as long as no error occurred, every allocated
+ * destination page was consumed and 'pagelist' is not yet empty.
+ *
+ * Pages that failed to migrate are spliced back onto 'pagelist'; the
+ * callers in this file hand whatever is left to putback_lru_pages().
+ *
+ * Return the number of pages left on 'pagelist' (i.e. not migrated), or a
+ * negative error code: -ENOMEM when a destination page cannot be allocated,
+ * otherwise the negative value returned by migrate_pages().
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
{
- return alloc_pages_node(node, GFP_HIGHUSER, 0);
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		/*
+		 * Stop once exactly MIGRATE_CHUNK_SIZE destination pages are
+		 * queued; testing with '>' would allocate one page too many
+		 * per chunk.
+		 */
+		if (nr_pages >= MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	/* All destination pages used and sources remain: do another chunk */
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	/* Hand the pages that could not be migrated back to the caller */
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
}
/*
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest);
-
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages_to(&pagelist, NULL, dest);
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+ }
return err;
}
down_read(&mm->mmap_sem);
- err = migrate_vmas(mm, from_nodes, to_nodes, flags);
- if (err)
- goto out;
-
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
if (err < 0)
break;
}
-out:
+
up_read(&mm->mmap_sem);
if (err < 0)
return err;
return busy;
-
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
-{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
-
- return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
-}
-#else
-
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
- unsigned long flags)
-{
-}
-
-int do_migrate_pages(struct mm_struct *mm,
- const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
-{
- return -ENOSYS;
-}
-
-static struct page *new_vma_page(struct page *page, unsigned long private)
-{
- return NULL;
-}
-#endif
-
long do_mbind(unsigned long start, unsigned long len,
unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma);
+ nr_failed = migrate_pages_to(&pagelist, vma, -1);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
}
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_free(new);
/*
* Check if this process has the right to modify the specified
* process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
+ * capabilities, superuser privileges or the same
* userid as the target process.
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
goto out;
}
- err = security_task_movememory(task);
- if (err)
- goto out;
-
err = do_migrate_pages(mm, &old, &new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
if (vma) {
unsigned long off;
- /*
- * for small pages, there is no difference between
- * shift and PAGE_SHIFT, so the bit-shift is safe.
- * for huge pages, since vm_pgoff is in units of small
- * pages, we need to shift off the always 0 bits to get
- * a useful offset.
- */
- BUG_ON(shift < PAGE_SHIFT);
- off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
+ off = vma->vm_pgoff;
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, vma, off);
} else
zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0])
- inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
+ if (page && page_zone(page) == zl->zones[0]) {
+ zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
+ put_cpu();
+ }
return page;
}
int show_numa_map(struct seq_file *m, void *v)
{
- struct proc_maps_private *priv = m->private;
+ struct task_struct *task = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
struct file *file = vma->vm_file;
return 0;
mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(priv->task, vma, vma->vm_start));
+ get_vma_policy(task, vma, vma->vm_start));
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
kfree(md);
if (m->count < m->size)
- m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
+ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
return 0;
}