X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=a9963ceddd65c483f589efe4f5c5124bddd36e8c;hb=16c70f8c1b54b61c3b951b6fb220df250fe09b32;hp=d06eabbf74f0b0d2a1b2053fe784fed58e534368;hpb=9bf4aaab3e101692164d49b7ca357651eb691cb6;p=linux-2.6.git diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d06eabbf7..a9963cedd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2,6 +2,7 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -17,13 +18,19 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. + * * bind Only allocate memory on a specific set of nodes, * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -66,6 +73,8 @@ #include #include #include +#include +#include #include #include #include @@ -74,39 +83,39 @@ #include #include #include +#include +#include +#include +#include +#include +#include + +#include #include -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +/* Internal flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ + +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) /* Highest zone. An specific allocation for a zone below that is not policied. */ -static int policy_zone; +int policy_zone = ZONE_DMA; -static struct mempolicy default_policy = { +struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .policy = MPOL_DEFAULT, }; -/* Check if all specified nodes are online */ -static int nodes_online(unsigned long *nodes) -{ - DECLARE_BITMAP(online2, MAX_NUMNODES); - - bitmap_copy(online2, node_online_map, MAX_NUMNODES); - if (bitmap_empty(online2, MAX_NUMNODES)) - set_bit(0, online2); - if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) - return -EINVAL; - return 0; -} - /* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, unsigned long *nodes) +static int mpol_check_policy(int mode, nodemask_t *nodes) { - int empty = bitmap_empty(nodes, MAX_NUMNODES); + int empty = nodes_empty(*nodes); switch (mode) { case MPOL_DEFAULT: @@ -121,86 +130,41 @@ static int mpol_check_policy(int mode, unsigned long *nodes) return -EINVAL; break; } - return nodes_online(nodes); -} - -/* Copy a node mask from user space. */ -static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, - unsigned long maxnode, int mode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - bitmap_zero(nodes, MAX_NUMNODES); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes[nlongs-1] &= endmask; - return mpol_check_policy(mode, nodes); + return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } /* Generate a custom zonelist for the BIND policy. */ -static struct zonelist *bind_zonelist(unsigned long *nodes) +static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; - int num, max, nd; + int num, max, nd, k; - max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for (nd = find_first_bit(nodes, MAX_NUMNODES); - nd < MAX_NUMNODES; - nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { - int k; - for (k = MAX_NR_ZONES-1; k >= 0; k--) { + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. */ + for (k = policy_zone; k >= 0; k--) { + for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; - if (!z->present_pages) - continue; - zl->zones[num++] = z; - if (k > policy_zone) - policy_zone = k; + if (z->present_pages > 0) + zl->zones[num++] = z; } } - BUG_ON(num >= max); zl->zones[num] = NULL; return zl; } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -209,10 +173,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) atomic_set(&policy->refcnt, 1); switch (mode) { case MPOL_INTERLEAVE: - bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + policy->v.nodes = *nodes; + if (nodes_weight(*nodes) == 0) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-EINVAL); + } break; case MPOL_PREFERRED: - policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + policy->v.preferred_node = first_node(*nodes); if (policy->v.preferred_node >= MAX_NUMNODES) policy->v.preferred_node = -1; break; @@ -225,61 +193,173 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } -/* Ensure all existing pages follow the policy. */ -static int -verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes) +static void gather_stats(struct page *, void *, int pte_dirty); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); + +/* Scan through pages checking if pages follow certain conditions. */ +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) { - while (addr < end) { - struct page *p; - pte_t *pte; - pmd_t *pmd; - pgd_t *pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) { - addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + pte_t *orig_pte; + pte_t *pte; + spinlock_t *ptl; + + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + do { + struct page *page; + unsigned int nid; + + if (!pte_present(*pte)) continue; - } - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) { - addr = (addr + PMD_SIZE) & PMD_MASK; + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - p = NULL; - pte = pte_offset_map(pmd, addr); - if (pte_present(*pte)) - p = pte_page(*pte); - pte_unmap(pte); - if (p) { - unsigned nid = page_to_nid(p); - if (!test_bit(nid, nodes)) - return -EIO; - } - addr += PAGE_SIZE; - } + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ + if (PageReserved(page)) + continue; + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & MPOL_MF_STATS) + gather_stats(page, private, pte_dirty(*pte)); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); + else + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return addr != end; +} + +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + if (check_pte_range(vma, pmd, addr, next, nodes, + flags, private)) + return -EIO; + } while (pmd++, addr = next, addr != end); return 0; } -/* Step 1: check the range */ +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + if (check_pmd_range(vma, pud, addr, next, nodes, + flags, private)) + return -EIO; + } while (pud++, addr = next, addr != end); + return 0; +} + +static inline int check_pgd_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ + pgd_t *pgd; + unsigned long next; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + if (check_pud_range(vma, pgd, addr, next, nodes, + flags, private)) + return -EIO; + } while (pgd++, addr = next, addr != end); + return 0; +} + +/* Check if a vma is migratable */ +static inline int vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & ( + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) + return 0; + return 1; +} + +/* + * Check if all pages in a range are on a set of nodes. + * If pagelist != NULL then isolate pages from the LRU and + * put them on the pagelist. + */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - unsigned long *nodes, unsigned long flags) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return ERR_PTR(err); + } + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { - if (!vma->vm_next && vma->vm_end < end) - return ERR_PTR(-EFAULT); - if (prev && prev->vm_end < vma->vm_start) - return ERR_PTR(-EFAULT); - if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { - err = verify_pages(vma->vm_start, vma->vm_end, nodes); + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + } + if (!is_vm_hugetlb_page(vma) && + ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma)))) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + err = check_pgd_range(vma, start, endvma, nodes, + flags, private); if (err) { first = ERR_PTR(err); break; @@ -333,98 +413,89 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -/* Change policy for a memory range */ -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +static int contextualize_policy(int mode, nodemask_t *nodes) { - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - DECLARE_BITMAP(nodes, MAX_NUMNODES); - int err; + if (!nodes) + return 0; - if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) - return -EINVAL; - if (start & ~PAGE_MASK) + cpuset_update_task_memory_state(); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) return -EINVAL; - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - if (end < start) - return -EINVAL; - if (end == start) - return 0; + return mpol_check_policy(mode, nodes); +} - err = get_nodes(nodes, nmask, maxnode, mode); - if (err) - return err; - new = mpol_new(mode, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes[0]); +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nodes, flags); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) - err = mbind_range(vma, start, end, new); - up_write(&mm->mmap_sem); - mpol_free(new); - return err; +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +long do_set_mempolicy(int mode, nodemask_t *nodes) { - int err; struct mempolicy *new; - DECLARE_BITMAP(nodes, MAX_NUMNODES); - if (mode > MPOL_MAX) + if (contextualize_policy(mode, nodes)) return -EINVAL; - err = get_nodes(nodes, nmask, maxnode, mode); - if (err) - return err; new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) - current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + current->il_next = first_node(new->v.nodes); return 0; } /* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { int i; - bitmap_zero(nodes, MAX_NUMNODES); + nodes_clear(*nodes); switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + *nodes); break; case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + *nodes = p->v.nodes; break; case MPOL_PREFERRED: /* or use current node instead of online map? */ if (p->v.preferred_node < 0) - bitmap_copy(nodes, node_online_map, MAX_NUMNODES); + *nodes = node_online_map; else - __set_bit(p->v.preferred_node, nodes); + node_set(p->v.preferred_node, *nodes); break; default: BUG(); @@ -438,43 +509,24 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); if (err >= 0) { - err = page_zone(p)->zone_pgdat->node_id; + err = page_to_nid(p); put_page(p); } return err; } -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - void *nodes, unsigned nbytes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; -} - /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { - int err, pval; + int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; - if (nmask != NULL && maxnode < numnodes) - return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -497,60 +549,530 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = lookup_node(mm, addr); if (err < 0) goto out; - pval = err; + *policy = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - pval = current->il_next; + *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - pval = pol->policy; + *policy = pol->policy; - err = -EFAULT; - if (policy && put_user(pval, policy)) - goto out; + if (vma) { + up_read(¤t->mm->mmap_sem); + vma = NULL; + } err = 0; - if (nmask) { - DECLARE_BITMAP(nodes, MAX_NUMNODES); - get_zonemask(pol, nodes); - err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); + if (nmask) + get_zonemask(pol, nmask); + + out: + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_MIGRATION +/* + * page migration + */ +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); +} + +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_node_page, dest); + + return err; +} + +/* + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int busy = 0; + int err = 0; + nodemask_t tmp; + + down_read(&mm->mmap_sem); + + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; + +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } +out: + up_read(&mm->mmap_sem); + if (err < 0) + return err; + return busy; + +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} + +static struct page *new_vma_page(struct page *page, unsigned long private) +{ + return NULL; +} +#endif + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + + if (!list_empty(&pagelist)) + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; } - out: - if (vma) - up_read(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + nodemask_t nodes; + int err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, &nodes, flags); +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + nodemask_t nodes; + + if (mode < 0 || mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, &nodes); +} + +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + err = security_task_movememory(task); + if (err) + goto out; + + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + return err; } #ifdef CONFIG_COMPAT -/* The other functions are compatible */ -asmlinkage long compat_get_mempolicy(int __user *policy, - unsigned __user *nmask, unsigned maxnode, - unsigned addr, unsigned flags) + +asmlinkage long compat_sys_get_mempolicy(int __user *policy, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, + compat_ulong_t addr, compat_ulong_t flags) { long err; unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + if (nmask) - nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8); - err = sys_get_mempolicy(policy, nm, maxnode, addr, flags); - if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8)) - err = -EFAULT; + nm = compat_alloc_user_space(alloc_size); + + err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + + if (!err && nmask) { + err = copy_from_user(bm, nm, alloc_size); + /* ensure entire bitmap is zeroed */ + err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); + err |= compat_put_bitmap(nmask, bm, nr_bits); + } + return err; } + +asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(bm, nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, bm, alloc_size); + } + + if (err) + return -EFAULT; + + return sys_set_mempolicy(mode, nm, nr_bits+1); +} + +asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, + compat_ulong_t mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode, compat_ulong_t flags) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + nodemask_t bm; + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, nodes_addr(bm), alloc_size); + } + + if (err) + return -EFAULT; + + return sys_mbind(start, len, mode, nm, nr_bits+1, flags); +} + #endif /* Return effective policy for a VMA */ -static struct mempolicy * -get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = task->mempolicy; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -561,7 +1083,7 @@ get_vma_policy(struct vm_area_struct *vma, unsigned long addr) } /* Return a zonelist representing a mempolicy */ -static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) +static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) { int nd; @@ -573,8 +1095,10 @@ static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) break; case MPOL_BIND: /* Lower zones don't get a policy applied */ - if (gfp >= policy_zone) - return policy->v.zonelist; + /* Careful: current->mems_allowed might have moved */ + if (gfp_zone(gfp) >= policy_zone) + if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) + return policy->v.zonelist; /*FALL THROUGH*/ case MPOL_INTERLEAVE: /* should not happen */ case MPOL_DEFAULT: @@ -584,7 +1108,7 @@ static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) nd = 0; BUG(); } - return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); + return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); } /* Do dynamic interleaving for a process */ @@ -594,47 +1118,107 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - BUG_ON(nid >= MAX_NUMNODES); - next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) - next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + next = first_node(policy->v.nodes); me->il_next = next; return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + switch (policy->policy) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return policy->v.zonelist->zones[0]->zone_pgdat->node_id; + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { - unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target = (unsigned)off % nnodes; int c; int nid = -1; c = 0; do { - nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + nid = next_node(nid, pol->v.nodes); c++; } while (c <= target); - BUG_ON(nid >= MAX_NUMNODES); - BUG_ON(!test_bit(nid, pol->v.nodes)); return nid; } +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +#ifdef CONFIG_HUGETLBFS +/* Return a zonelist suitable for a huge page allocation. */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + if (pol->policy == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + } + return zonelist_policy(GFP_HIGHUSER, pol); +} +#endif + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid) +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) { struct zonelist *zl; struct page *page; - BUG_ON(!test_bit(nid, node_online_map)); - zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zl->zones[0]->pageset[get_cpu()].interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -661,23 +1245,16 @@ static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = get_vma_policy(vma, addr); + struct mempolicy *pol = get_vma_policy(current, vma, addr); + + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; - if (vma) { - unsigned long off; - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); - off = vma->vm_pgoff; - off += (addr - vma->vm_start) >> PAGE_SHIFT; - nid = offset_il_node(pol, vma, off); - } else { - /* fall back to process interleaving */ - nid = interleave_nodes(pol); - } + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); @@ -697,11 +1274,17 @@ alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr) * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. + * + * Don't call cpuset_update_task_memory_state() unless + * 1) it's ok to take cpuset_sem (can WAIT), and + * 2) allocating for current task (not interrupt). */ -struct page *alloc_pages_current(unsigned gfp, unsigned order) +struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + if ((gfp & __GFP_WAIT) && !in_interrupt()) + cpuset_update_task_memory_state(); if (!pol || in_interrupt()) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) @@ -710,6 +1293,15 @@ struct page *alloc_pages_current(unsigned gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -717,6 +1309,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { @@ -742,7 +1338,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_DEFAULT: return 1; case MPOL_INTERLEAVE: - return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { @@ -769,65 +1365,17 @@ void __mpol_free(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -/* - * Hugetlb policy. Same as above, just works with node numbers instead of - * zonelists. - */ - -/* Find first node suitable for an allocation */ -int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(vma, addr); - - switch (pol->policy) { - case MPOL_DEFAULT: - return numa_node_id(); - case MPOL_BIND: - return pol->v.zonelist->zones[0]->zone_pgdat->node_id; - case MPOL_INTERLEAVE: - return interleave_nodes(pol); - case MPOL_PREFERRED: - return pol->v.preferred_node >= 0 ? - pol->v.preferred_node : numa_node_id(); - } - BUG(); - return 0; -} - -/* Find secondary valid nodes for an allocation */ -int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) -{ - struct mempolicy *pol = get_vma_policy(vma, addr); - - switch (pol->policy) { - case MPOL_PREFERRED: - case MPOL_DEFAULT: - case MPOL_INTERLEAVE: - return 1; - case MPOL_BIND: { - struct zone **z; - for (z = pol->v.zonelist->zones; *z; z++) - if ((*z)->zone_pgdat->node_id == nid) - return 1; - return 0; - } - default: - BUG(); - return 0; - } -} - /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->sem semaphore, which should be held + * They are protected by the sp->lock spinlock, which should be held * for any accesses to the tree. */ /* lookup first element intersecting start-end */ -/* Caller holds sp->sem */ +/* Caller holds sp->lock */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -835,13 +1383,13 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); - if (start >= p->end) { + + if (start >= p->end) n = n->rb_right; - } else if (end < p->start) { + else if (end <= p->start) n = n->rb_left; - } else { + else break; - } } if (!n) return NULL; @@ -859,7 +1407,7 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) } /* Insert a new shared policy into the list. */ -/* Caller holds sp->sem */ +/* Caller holds sp->lock */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -889,13 +1437,15 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) struct mempolicy *pol = NULL; struct sp_node *sn; - down(&sp->sem); + if (!sp->root.rb_node) + return NULL; + spin_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - up(&sp->sem); + spin_unlock(&sp->lock); return pol; } @@ -925,9 +1475,10 @@ sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2; + struct sp_node *n, *new2 = NULL; - down(&sp->sem); +restart: + spin_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -940,16 +1491,18 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } else { /* Old policy spanning whole new range. */ if (n->end > end) { - new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - up(&sp->sem); - return -ENOMEM; + spin_unlock(&sp->lock); + new2 = sp_alloc(end, n->end, n->policy); + if (!new2) + return -ENOMEM; + goto restart; } - n->end = end; + n->end = start; sp_insert(sp, new2); - } - /* Old crossing beginning, but not end (easy) */ - if (n->start < start && n->end > start) + new2 = NULL; + break; + } else n->end = start; } if (!next) @@ -958,10 +1511,38 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, } if (new) sp_insert(sp, new); - up(&sp->sem); + spin_unlock(&sp->lock); + if (new2) { + mpol_free(new2->policy); + kmem_cache_free(sn_cache, new2); + } return 0; } +void mpol_shared_policy_init(struct shared_policy *info, int policy, + nodemask_t *policy_nodes) +{ + info->root = RB_ROOT; + spin_lock_init(&info->lock); + + if (policy != MPOL_DEFAULT) { + struct mempolicy *newpol; + + /* Falls back to MPOL_DEFAULT on any error */ + newpol = mpol_new(policy, policy_nodes); + if (!IS_ERR(newpol)) { + /* Create pseudo-vma that contains just the policy */ + struct vm_area_struct pvma; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + /* Policy covers entire file */ + pvma.vm_end = TASK_SIZE; + mpol_set_shared_policy(info, &pvma, newpol); + mpol_free(newpol); + } + } +} + int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { @@ -972,7 +1553,7 @@ int mpol_set_shared_policy(struct shared_policy *info, PDprintk("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? npol->v.nodes[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -991,7 +1572,9 @@ void mpol_free_shared_policy(struct shared_policy *p) struct sp_node *n; struct rb_node *next; - down(&p->sem); + if (!p->root.rb_node) + return; + spin_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); @@ -1000,7 +1583,7 @@ void mpol_free_shared_policy(struct shared_policy *p) mpol_free(n->policy); kmem_cache_free(sn_cache, n); } - up(&p->sem); + spin_unlock(&p->lock); } /* assumes fs == KERNEL_DS */ @@ -1017,13 +1600,304 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. */ - if (sys_set_mempolicy(MPOL_INTERLEAVE, node_online_map, MAX_NUMNODES) < 0) + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default. - * Assumes fs == KERNEL_DS */ +/* Reset policy of current process to default */ void numa_default_policy(void) { - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); + do_set_mempolicy(MPOL_DEFAULT, NULL); +} + +/* Migrate a policy to a different set of nodes */ +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +{ + nodemask_t *mpolmask; + nodemask_t tmp; + + if (!pol) + return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; + + switch (pol->policy) { + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); + pol->v.nodes = tmp; + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); + break; + case MPOL_PREFERRED: + pol->v.preferred_node = node_remap(pol->v.preferred_node, + *mpolmask, *newmask); + *mpolmask = *newmask; + break; + case MPOL_BIND: { + nodemask_t nodes; + struct zone **z; + struct zonelist *zonelist; + + nodes_clear(nodes); + for (z = pol->v.zonelist->zones; *z; z++) + node_set((*z)->zone_pgdat->node_id, nodes); + nodes_remap(tmp, nodes, *mpolmask, *newmask); + nodes = tmp; + + zonelist = bind_zonelist(&nodes); + + /* If no mem, then zonelist is NULL and we keep old zonelist. + * If that old zonelist has no remaining mems_allowed nodes, + * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. + */ + + if (zonelist) { + /* Good - got mem - substitute new zonelist */ + kfree(pol->v.zonelist); + pol->v.zonelist = zonelist; + } + *mpolmask = *newmask; + break; + } + default: + BUG(); + break; + } +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char *policy_types[] = { "default", "prefer", "bind", + "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private, int pte_dirty) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + +int show_numa_map(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + int n; + char buffer[50]; + + if (!mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(priv->task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + } + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + kfree(md); + + if (m->count < m->size) + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; + return 0; } +