X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmempolicy.c;fp=mm%2Fmempolicy.c;h=36d7ebeff141f42404c410c4f58ea0bd608c12fb;hb=64ba3f394c830ec48a1c31b53dcae312c56f1604;hp=a9963ceddd65c483f589efe4f5c5124bddd36e8c;hpb=be1e6109ac94a859551f8e1774eb9a8469fe055c;p=linux-2.6.git

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a9963cedd..36d7ebeff 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,9 +86,7 @@
 #include
 #include
 #include
-#include
-#include
-#include
+#include

 #include
 #include
@@ -98,8 +96,11 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)	/* Gather statistics */

-static struct kmem_cache *policy_cache;
-static struct kmem_cache *sn_cache;
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
+static kmem_cache_t *policy_cache;
+static kmem_cache_t *sn_cache;

 #define PDprintk(fmt...)

@@ -331,10 +332,17 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	struct vm_area_struct *first, *vma, *prev;

 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		/* Must have swap device for migration */
+		if (nr_swap_pages <= 0)
+			return ERR_PTR(-ENODEV);

-		err = migrate_prep();
-		if (err)
-			return ERR_PTR(err);
+		/*
+		 * Clear the LRU lists so pages can be isolated.
+		 * Note that pages may be moved off the LRU after we have
+		 * drained them. Those pages will fail to migrate like other
+		 * pages that may be busy.
+		 */
+		lru_add_drain_all();
 	}

 	first = find_vma(mm, start);
@@ -424,37 +432,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }

-
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
-	if (p->mempolicy)
-		p->flags |= PF_MEMPOLICY;
-	else
-		p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
-	mpol_fix_fork_child_flag(current);
-}
-
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -467,7 +444,6 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
-	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
@@ -575,23 +551,92 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }

-#ifdef CONFIG_MIGRATION
 /*
  * page migration
  */
+
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
-		isolate_lru_page(page, pagelist);
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
+			list_add_tail(&page->lru, pagelist);
+	}
 }

-static struct page *new_node_page(struct page *page, unsigned long node, int **x)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
 {
-	return alloc_pages_node(node, GFP_HIGHUSER, 0);
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
 }

 /*
@@ -610,9 +655,11 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)

 	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

-	if (!list_empty(&pagelist))
-		err = migrate_pages(&pagelist, new_node_page, dest);
-
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
 	return err;
 }
@@ -632,10 +679,6 @@ int do_migrate_pages(struct mm_struct *mm,

 	down_read(&mm->mmap_sem);

-	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
-	if (err)
-		goto out;
-
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
@@ -695,39 +738,13 @@ int do_migrate_pages(struct mm_struct *mm,
 		if (err < 0)
 			break;
 	}
-out:
+
 	up_read(&mm->mmap_sem);
 	if (err < 0)
 		return err;
 	return busy;
-
 }
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
-{
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
-
-	return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
-}
-#else
-
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
-				unsigned long flags)
-{
-}
-
-int do_migrate_pages(struct mm_struct *mm,
-	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
-{
-	return -ENOSYS;
-}
-
-static struct page *new_vma_page(struct page *page, unsigned long private)
-{
-	return NULL;
-}
-#endif
-
 long do_mbind(unsigned long start, unsigned long len, unsigned long mode,
 		nodemask_t *nmask, unsigned long flags)
 {
@@ -787,12 +804,13 @@ long do_mbind(unsigned long start, unsigned long len,
 		err = mbind_range(vma, start, end, new);

 		if (!list_empty(&pagelist))
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma);
+			nr_failed = migrate_pages_to(&pagelist, vma, -1);

 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
+	if (!list_empty(&pagelist))
+		putback_lru_pages(&pagelist);

 	up_write(&mm->mmap_sem);
 	mpol_free(new);
@@ -930,7 +948,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	/*
 	 * Check if this process has the right to modify the specified
 	 * process. The right exists if the process has administrative
-	 * capabilities, superuser privileges or the same
+	 * capabilities, superuser priviledges or the same
 	 * userid as the target process.
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
@@ -947,10 +965,6 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		goto out;
 	}

-	err = security_task_movememory(task);
-	if (err)
-		goto out;
-
 	err = do_migrate_pages(mm, &old, &new,
 		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
@@ -1176,15 +1190,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 	if (vma) {
 		unsigned long off;

-		/*
-		 * for small pages, there is no difference between
-		 * shift and PAGE_SHIFT, so the bit-shift is safe.
-		 * for huge pages, since vm_pgoff is in units of small
-		 * pages, we need to shift off the always 0 bits to get
-		 * a useful offset.
-		 */
-		BUG_ON(shift < PAGE_SHIFT);
-		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
+		off = vma->vm_pgoff;
 		off += (addr - vma->vm_start) >> shift;
 		return offset_il_node(pol, vma, off);
 	} else
@@ -1217,8 +1223,10 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,

 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
-	if (page && page_zone(page) == zl->zones[0])
-		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
+	if (page && page_zone(page) == zl->zones[0]) {
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
+		put_cpu();
+	}
 	return page;
 }

@@ -1827,7 +1835,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,

 int show_numa_map(struct seq_file *m, void *v)
 {
-	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
 	struct file *file = vma->vm_file;
@@ -1843,7 +1851,7 @@ int show_numa_map(struct seq_file *m, void *v)
 		return 0;

 	mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(priv->task, vma, vma->vm_start));
+			get_vma_policy(task, vma, vma->vm_start));

 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

@@ -1897,7 +1905,7 @@ out:
 	kfree(md);

 	if (m->count < m->size)
-		m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
 	return 0;
 }
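
A note on the migrate_pages_to() function this patch adds: it allocates target pages for at most MIGRATE_CHUNK_SIZE source pages per pass, migrates that batch, then jumps back to the redo label until the source list drains or an error occurs. The shape of that loop, reduced to a minimal userspace C sketch (struct item, process_chunk(), and CHUNK are made-up stand-ins for this illustration, not the kernel API):

/* Userspace sketch of the chunked "redo:" loop in migrate_pages_to();
 * struct item and process_chunk() are invented for illustration. */
#include <stdio.h>
#include <stdlib.h>

#define CHUNK 256	/* mirrors MIGRATE_CHUNK_SIZE (assumed) */

struct item {
	int id;
	struct item *next;
};

/* Detach and "migrate" up to CHUNK items from *list; returns the
 * number handled so the caller can keep looping until empty. */
static int process_chunk(struct item **list)
{
	int n = 0;

	while (*list && n < CHUNK) {
		struct item *it = *list;

		*list = it->next;	/* detach, as page isolation does */
		free(it);		/* stand-in for the real migration */
		n++;
	}
	return n;
}

int main(void)
{
	struct item *list = NULL;
	int i, done = 0;

	for (i = 0; i < 1000; i++) {	/* build a work list */
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->id = i;
		it->next = list;
		list = it;
	}

	while (list)			/* the "redo:" loop */
		done += process_chunk(&list);

	printf("processed %d items in chunks of %d\n", done, CHUNK);
	return 0;
}

Bounding each pass keeps the per-pass allocation of destination pages small instead of allocating one new page for every source page up front.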
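Likewise, the interleave_nid() hunk removes a comment worth restating: vm_pgoff is always kept in small-page units, so for huge pages the low (shift - PAGE_SHIFT) bits are always zero and must be shifted off before adding the in-VMA page index, or the interleave offset mixes units. A minimal userspace sketch of that arithmetic (the fake_vma type, the PAGE_SHIFT value, and the sample numbers are assumptions for illustration, not kernel code):

/* Userspace illustration of the interleave offset math; fake_vma
 * and the constants below are assumed, not kernel definitions. */
#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed 4 KiB base pages */

struct fake_vma {
	unsigned long vm_start;		/* start of the mapping */
	unsigned long vm_pgoff;		/* file offset in base-page units */
};

/* Mainline form: convert vm_pgoff into 'shift'-sized units first,
 * then add the page index of addr within the VMA. */
static unsigned long interleave_off(const struct fake_vma *vma,
				    unsigned long addr, int shift)
{
	unsigned long off = vma->vm_pgoff >> (shift - PAGE_SHIFT);

	off += (addr - vma->vm_start) >> shift;
	return off;
}

int main(void)
{
	struct fake_vma vma = { .vm_start = 0x40000000UL, .vm_pgoff = 1024 };
	unsigned long addr = vma.vm_start + (3UL << 21);	/* 3rd 2 MiB page */

	/* For base pages shift == PAGE_SHIFT and the extra shift is a
	 * no-op; for 2 MiB huge pages (shift == 21) it matters. */
	printf("4 KiB pages: off=%lu\n", interleave_off(&vma, addr, PAGE_SHIFT));
	printf("2 MiB pages: off=%lu\n", interleave_off(&vma, addr, 21));
	return 0;
}

The reverted code uses vm_pgoff directly, which is only consistent when shift == PAGE_SHIFT; the mainline side of this hunk is what makes huge-page interleaving use a uniform unit.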