X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmempolicy.c;fp=mm%2Fmempolicy.c;h=36d7ebeff141f42404c410c4f58ea0bd608c12fb;hb=64ba3f394c830ec48a1c31b53dcae312c56f1604;hp=a9963ceddd65c483f589efe4f5c5124bddd36e8c;hpb=be1e6109ac94a859551f8e1774eb9a8469fe055c;p=linux-2.6.git

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963cedd..36d7ebeff 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,9 +86,7 @@
 #include
 #include
 #include
-#include
-#include
-#include
+#include

 #include
 #include
@@ -98,8 +96,11 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)	/* Gather statistics */

-static struct kmem_cache *policy_cache;
-static struct kmem_cache *sn_cache;
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
+static kmem_cache_t *policy_cache;
+static kmem_cache_t *sn_cache;

 #define PDprintk(fmt...)

@@ -331,10 +332,17 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	struct vm_area_struct *first, *vma, *prev;

 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		/* Must have swap device for migration */
+		if (nr_swap_pages <= 0)
+			return ERR_PTR(-ENODEV);

-		err = migrate_prep();
-		if (err)
-			return ERR_PTR(err);
+		/*
+		 * Clear the LRU lists so pages can be isolated.
+		 * Note that pages may be moved off the LRU after we have
+		 * drained them. Those pages will fail to migrate like other
+		 * pages that may be busy.
+		 */
+		lru_add_drain_all();
 	}

 	first = find_vma(mm, start);
@@ -424,37 +432,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }

-
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
-}
-
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -467,7 +444,6 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
@@ -575,23 +551,92 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }

-#ifdef CONFIG_MIGRATION
 /*
  * page migration
  */
+
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
-		isolate_lru_page(page, pagelist);
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
+			list_add_tail(&page->lru, pagelist);
+	}
 }

-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
 {
-	return alloc_pages_node(node, GFP_HIGHUSER, 0);
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
 }

 /*
@@ -610,9 +655,11 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)

 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

-	if (!list_empty(&pagelist))
-		err = migrate_pages(&pagelist, new_node_page, dest);
-
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
 	return err;
 }
@@ -632,10 +679,6 @@ int do_migrate_pages(struct mm_struct *mm,

 	down_read(&mm->mmap_sem);

-	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
-	if (err)
-		goto out;
-
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -695,39 +738,13 @@ int do_migrate_pages(struct mm_struct *mm,
 		if (err < 0)
 			break;
 	}
-out:
+
 	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
 	return busy;
-
 }
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
-{
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
-
-	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
-}
-#else
-
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
-				unsigned long flags)
-{
-}
-
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
-{
-	return -ENOSYS;
-}
-
-static struct page *new_vma_page(struct page *page, unsigned long private)
-{
-	return NULL;
-}
-#endif
-
 long do_mbind(unsigned long start, unsigned long len, unsigned long mode,
 		nodemask_t *nmask, unsigned long flags)
 {
@@ -787,12 +804,13 @@ long do_mbind(unsigned long start, unsigned long len,
 		err = mbind_range(vma, start, end, new);

 		if (!list_empty(&pagelist))
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma);
+			nr_failed = migrate_pages_to(&pagelist, vma, -1);

 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);

 	up_write(&mm->mmap_sem);
 	mpol_free(new);
@@ -930,7 +948,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	/*
 	 * Check if this process has the right to modify the specified
 	 * process. The right exists if the process has administrative
-	 * capabilities, superuser privileges or the same
+	 * capabilities, superuser priviledges or the same
 	 * userid as the target process.
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
@@ -947,10 +965,6 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		goto out;
 	}

-	err = security_task_movememory(task);
-	if (err)
-		goto out;
-
 	err = do_migrate_pages(mm, &old, &new,
 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
@@ -1176,15 +1190,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	if (vma) {
 		unsigned long off;

-		/*
-		 * for small pages, there is no difference between
-		 * shift and PAGE_SHIFT, so the bit-shift is safe.
-		 * for huge pages, since vm_pgoff is in units of small
-		 * pages, we need to shift off the always 0 bits to get
-		 * a useful offset.
-		 */
-		BUG_ON(shift < PAGE_SHIFT);
-		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
+		off = vma->vm_pgoff;
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
@@ -1217,8 +1223,10 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,

 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
-	if (page && page_zone(page) == zl->zones[0])
-		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
+	if (page && page_zone(page) == zl->zones[0]) {
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
+		put_cpu();
+	}
 	return page;
 }

@@ -1827,7 +1835,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,

 int show_numa_map(struct seq_file *m, void *v)
 {
-	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
@@ -1843,7 +1851,7 @@ int show_numa_map(struct seq_file *m, void *v)
 		return 0;

 	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(priv->task, vma, vma->vm_start));
+			get_vma_policy(task, vma, vma->vm_start));

 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

@@ -1897,7 +1905,7 @@ out:
 	kfree(md);

 	if (m->count < m->size)
-		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
 	return 0;
 }
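
A note on the migrate_pages_to() function this patch adds: it allocates target pages for at most MIGRATE_CHUNK_SIZE source pages per pass, migrates that batch, then jumps back to the redo label until the source list drains or an error occurs. The shape of that loop, reduced to a minimal userspace C sketch (struct item, process_chunk(), and CHUNK are made-up stand-ins for this illustration, not the kernel API):

/* Userspace sketch of the chunked "redo:" loop in migrate_pages_to();
 * struct item and process_chunk() are invented for illustration. */
#include <stdio.h>
#include <stdlib.h>

#define CHUNK 256	/* mirrors MIGRATE_CHUNK_SIZE (assumed) */

struct item {
	int id;
	struct item *next;
};

/* Detach and "migrate" up to CHUNK items from *list; returns the
 * number handled so the caller can keep looping until empty. */
static int process_chunk(struct item **list)
{
	int n = 0;

	while (*list && n < CHUNK) {
		struct item *it = *list;

		*list = it->next;	/* detach, as page isolation does */
		free(it);		/* stand-in for the real migration */
		n++;
	}
	return n;
}

int main(void)
{
	struct item *list = NULL;
	int i, done = 0;

	for (i = 0; i < 1000; i++) {	/* build a work list */
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->id = i;
		it->next = list;
		list = it;
	}

	while (list)			/* the "redo:" loop */
		done += process_chunk(&list);

	printf("processed %d items in chunks of %d\n", done, CHUNK);
	return 0;
}

Bounding each pass keeps the per-pass allocation of destination pages small instead of allocating one new page for every source page up front.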
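Likewise, the interleave_nid() hunk removes a comment worth restating: vm_pgoff is always kept in small-page units, so for huge pages the low (shift - PAGE_SHIFT) bits are always zero and must be shifted off before adding the in-VMA page index, or the interleave offset mixes units. A minimal userspace sketch of that arithmetic (the fake_vma type, the PAGE_SHIFT value, and the sample numbers are assumptions for illustration, not kernel code):

/* Userspace illustration of the interleave offset math; fake_vma
 * and the constants below are assumed, not kernel definitions. */
#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed 4 KiB base pages */

struct fake_vma {
	unsigned long vm_start;		/* start of the mapping */
	unsigned long vm_pgoff;		/* file offset in base-page units */
};

/* Mainline form: convert vm_pgoff into 'shift'-sized units first,
 * then add the page index of addr within the VMA. */
static unsigned long interleave_off(const struct fake_vma *vma,
				    unsigned long addr, int shift)
{
	unsigned long off = vma->vm_pgoff >> (shift - PAGE_SHIFT);

	off += (addr - vma->vm_start) >> shift;
	return off;
}

int main(void)
{
	struct fake_vma vma = { .vm_start = 0x40000000UL, .vm_pgoff = 1024 };
	unsigned long addr = vma.vm_start + (3UL << 21);	/* 3rd 2 MiB page */

	/* For base pages shift == PAGE_SHIFT and the extra shift is a
	 * no-op; for 2 MiB huge pages (shift == 21) it matters. */
	printf("4 KiB pages: off=%lu\n", interleave_off(&vma, addr, PAGE_SHIFT));
	printf("2 MiB pages: off=%lu\n", interleave_off(&vma, addr, 21));
	return 0;
}

The reverted code uses vm_pgoff directly, which is only consistent when shift == PAGE_SHIFT; the mainline side of this hunk is what makes huge-page interleaving use a uniform unit.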