/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger OOM much faster and the
   kernel does not always handle that gracefully.
   could replace all the switch()es with a mempolicy_ops structure.
*/
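/*
 * Illustrative userspace sketch (not part of this file, hedged): how the
 * four policies above are typically requested through the set_mempolicy()
 * and mbind() system calls. This assumes the libnuma <numaif.h> wrappers;
 * a raw syscall(2) with the numbers from <asm/unistd.h> works the same way,
 * and the mask layout follows get_nodes() below.
 *
 *      #include <numaif.h>
 *
 *      unsigned long mask = (1UL << 0) | (1UL << 1);   // nodes 0 and 1
 *
 *      // Interleave all future process allocations across nodes 0-1.
 *      set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *      // Restrict one mapping to node 0 only (bind), strict checking.
 *      unsigned long node0 = 1UL << 0;
 *      mbind(addr, length, MPOL_BIND, &node0, sizeof(node0) * 8,
 *            MPOL_MF_STRICT);
 *
 *      // Back to the default local allocation policy.
 *      set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */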
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/vs_cvirt.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous VMAs */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

#define PDprintk(fmt...)
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
        int empty = nodes_empty(*nodes);

        switch (mode) {
        case MPOL_DEFAULT:
                if (!empty)
                        return -EINVAL;
                break;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* Preferred will only use the first bit, but allow
                   more for now. */
                if (empty)
                        return -EINVAL;
        }
        return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
        struct zonelist *zl;
        int num, max, nd, k;

        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
        if (!zl)
                return NULL;
        num = 0;
        /* First put in the highest zones from all nodes, then all the next
           lower zones etc. Avoid empty zones because the memory allocator
           doesn't like them. If you implement node hot removal you
           have to fix that. */
        for (k = policy_zone; k >= 0; k--) {
                for_each_node_mask(nd, *nodes) {
                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
                        if (z->present_pages > 0)
                                zl->zones[num++] = z;
                }
        }
        zl->zones[num] = NULL;
        return zl;
}
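/*
 * Worked example (illustrative): with policy_zone == ZONE_HIGHMEM and a
 * bind nodemask of {0,1}, the generated list is ordered
 *
 *      node0/HIGHMEM, node1/HIGHMEM, node0/NORMAL, node1/NORMAL,
 *      node0/DMA, node1/DMA, NULL
 *
 * with any zone whose present_pages is 0 skipped. The allocator thus falls
 * from the highest zones of all bound nodes down to the lowest.
 */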
/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
        struct mempolicy *policy;

        PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
        if (mode == MPOL_DEFAULT)
                return NULL;
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        switch (mode) {
        case MPOL_INTERLEAVE:
                policy->v.nodes = *nodes;
                if (nodes_weight(*nodes) == 0) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-EINVAL);
                }
                break;
        case MPOL_PREFERRED:
                policy->v.preferred_node = first_node(*nodes);
                if (policy->v.preferred_node >= MAX_NUMNODES)
                        policy->v.preferred_node = -1;
                break;
        case MPOL_BIND:
                policy->v.zonelist = bind_zonelist(nodes);
                if (policy->v.zonelist == NULL) {
                        kmem_cache_free(policy_cache, policy);
                        return ERR_PTR(-ENOMEM);
                }
                break;
        }
        policy->policy = mode;
        policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
        return policy;
}
static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pte_t *orig_pte;
        pte_t *pte;
        spinlock_t *ptl;

        orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;
                unsigned int nid;

                if (!pte_present(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
                /*
                 * The check for PageReserved here is important to avoid
                 * handling zero pages and other pages that may have been
                 * marked special by the system.
                 *
                 * If the PageReserved would not be checked here then f.e.
                 * the location of the zero page could have an influence
                 * on MPOL_MF_STRICT, zero pages would be counted for
                 * the per node stats, and there would be useless attempts
                 * to put zero pages on the migration list.
                 */
                if (PageReserved(page))
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                        continue;

                if (flags & MPOL_MF_STATS)
                        gather_stats(page, private, pte_dirty(*pte));
                else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                        migrate_page_add(page, private, flags);
                else
                        break;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(orig_pte, ptl);
        return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}
static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                if (check_pmd_range(vma, pud, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pud++, addr = next, addr != end);
        return 0;
}
static inline int check_pgd_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pgd_t *pgd;
        unsigned long next;

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                if (check_pud_range(vma, pgd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pgd++, addr = next, addr != end);
        return 0;
}
/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (
                VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
                return 0;
        return 1;
}
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                const nodemask_t *nodes, unsigned long flags, void *private)
{
        int err;
        struct vm_area_struct *first, *vma, *prev;

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

                err = migrate_prep();
                if (err)
                        return ERR_PTR(err);
        }

        first = find_vma(mm, start);
        if (!first)
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                        if (!vma->vm_next && vma->vm_end < end)
                                return ERR_PTR(-EFAULT);
                        if (prev && prev->vm_end < vma->vm_start)
                                return ERR_PTR(-EFAULT);
                }
                if (!is_vm_hugetlb_page(vma) &&
                    ((flags & MPOL_MF_STRICT) ||
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                        vma_migratable(vma)))) {
                        unsigned long endvma = vma->vm_end;

                        if (endvma > end)
                                endvma = end;
                        if (vma->vm_start > start)
                                start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes,
                                                flags, private);
                        if (err) {
                                first = ERR_PTR(err);
                                break;
                        }
                }
                prev = vma;
        }
        return first;
}
/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
        int err = 0;
        struct mempolicy *old = vma->vm_policy;

        PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                 vma->vm_start, vma->vm_end, vma->vm_pgoff,
                 vma->vm_ops, vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

        if (vma->vm_ops && vma->vm_ops->set_policy)
                err = vma->vm_ops->set_policy(vma, new);
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
                mpol_free(old);
        }
        return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, struct mempolicy *new)
{
        struct vm_area_struct *next;
        int err;

        err = 0;
        for (; vma && vma->vm_start < end; vma = next) {
                next = vma->vm_next;
                if (vma->vm_start < start)
                        err = split_vma(vma->vm_mm, vma, start, 1);
                if (!err && vma->vm_end > end)
                        err = split_vma(vma->vm_mm, vma, end, 0);
                if (!err)
                        err = policy_vma(vma, new);
                if (err)
                        break;
        }
        return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
        if (!nodes)
                return 0;

        cpuset_update_task_memory_state();
        if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
                return -EINVAL;
        return mpol_check_policy(mode, nodes);
}
/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy. Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */
void mpol_fix_fork_child_flag(struct task_struct *p)
{
        if (p->mempolicy)
                p->flags |= PF_MEMPOLICY;
        else
                p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
        mpol_fix_fork_child_flag(current);
}
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
        struct mempolicy *new;

        if (contextualize_policy(mode, nodes))
                return -EINVAL;
        new = mpol_new(mode, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
        mpol_set_task_struct_flag();
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
}
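/*
 * Illustrative example: after do_set_mempolicy(MPOL_INTERLEAVE, {0,1,2,3}),
 * il_next starts at node 0 and interleave_nodes() below advances it on
 * every allocation, so successive process-policy allocations land on
 * nodes 0, 1, 2, 3, 0, 1, ... (skipping nodes that are not in the mask).
 */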
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
        int i;

        nodes_clear(*nodes);
        switch (p->policy) {
        case MPOL_BIND:
                for (i = 0; p->v.zonelist->zones[i]; i++)
                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
                                *nodes);
                break;
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                *nodes = p->v.nodes;
                break;
        case MPOL_PREFERRED:
                /* or use current node instead of online map? */
                if (p->v.preferred_node < 0)
                        *nodes = node_online_map;
                else
                        node_set(p->v.preferred_node, *nodes);
                break;
        default:
                BUG();
        }
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *p;
        int err;

        err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
        if (err >= 0) {
                err = page_to_nid(p);
                put_page(p);
        }
        return err;
}
/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
                        unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy;

        cpuset_update_task_memory_state();
        if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                return -EINVAL;
        if (flags & MPOL_F_ADDR) {
                down_read(&mm->mmap_sem);
                vma = find_vma_intersection(mm, addr, addr+1);
                if (!vma) {
                        up_read(&mm->mmap_sem);
                        return -EFAULT;
                }
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else
                        pol = vma->vm_policy;
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->policy == MPOL_INTERLEAVE) {
                        *policy = current->il_next;
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else
                *policy = pol->policy;

        if (vma) {
                up_read(&current->mm->mmap_sem);
                vma = NULL;
        }

        err = 0;
        if (nmask)
                get_zonemask(pol, nmask);

 out:
        if (vma)
                up_read(&current->mm->mmap_sem);
        return err;
}
#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
        /*
         * Avoid migrating a page that is shared with others.
         */
        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
                isolate_lru_page(page, pagelist);
}
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
{
        nodemask_t nmask;
        LIST_HEAD(pagelist);
        int err = 0;

        nodes_clear(nmask);
        node_set(source, nmask);

        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);

        if (!list_empty(&pagelist)) {
                err = migrate_pages_to(&pagelist, NULL, dest);
                if (!list_empty(&pagelist))
                        putback_lru_pages(&pagelist);
        }
        return err;
}
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        LIST_HEAD(pagelist);
        int busy = 0;
        int err = 0;
        nodemask_t tmp;

        down_read(&mm->mmap_sem);

/*
 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
 * bit in 'tmp', and return that <source, dest> pair for migration.
 * The pair of nodemasks 'to' and 'from' define the map.
 *
 * If no pair of bits is found that way, fallback to picking some
 * pair of 'source' and 'dest' bits that are not the same. If the
 * 'source' and 'dest' bits are the same, this represents a node
 * that will be migrating to itself, so no pages need move.
 *
 * If no bits are left in 'tmp', or if all remaining bits left
 * in 'tmp' correspond to the same bit in 'to', return false
 * (nothing left to migrate).
 *
 * This lets us pick a pair of nodes to migrate between, such that
 * if possible the dest node is not already occupied by some other
 * source node, minimizing the risk of overloading the memory on a
 * node that would happen if we migrated incoming memory to a node
 * before migrating outgoing memory off of that same node.
 *
 * A single scan of tmp is sufficient. As we go, we remember the
 * most recent <s, d> pair that moved (s != d). If we find a pair
 * that not only moved, but what's better, moved to an empty slot
 * (d is not set in tmp), then we break out then, with that pair.
 * Otherwise when we finish scanning tmp, we at least have the
 * most recent <s, d> pair that moved. If we get all the way through
 * the scan of tmp without finding any node that moved, much less
 * moved to an empty node, then there is nothing left worth migrating.
 */
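/*
 * Worked example (illustrative): from_nodes = {0,1}, to_nodes = {2,3}.
 * First pass over tmp = {0,1}: s=0 remaps to d=2; 2 is not in tmp, so
 * migrate 0 -> 2 and clear 0 from tmp. Next pass: s=1 remaps to d=3,
 * giving 1 -> 3. With from = {0,1} and to = {1,2} instead, the scan
 * prefers the pair whose dest is empty: 1 -> 2 runs first, then 0 -> 1,
 * so memory is never piled onto a node that is still waiting to drain.
 */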
        tmp = *from_nodes;
        while (!nodes_empty(tmp)) {
                int s, d;
                int source = -1;
                int dest = 0;

                for_each_node_mask(s, tmp) {
                        d = node_remap(s, *from_nodes, *to_nodes);
                        if (s == d)
                                continue;

                        source = s;     /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                if (source == -1)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                if (err > 0)
                        busy += err;
                if (err < 0)
                        break;
        }

        up_read(&mm->mmap_sem);
        if (err < 0)
                return err;
        return busy;
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
#endif
long do_mbind(unsigned long start, unsigned long len,
                unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        struct mempolicy *new;
        unsigned long end;
        int err;
        LIST_HEAD(pagelist);

        if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
            || mode > MPOL_MAX)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = (len + PAGE_SIZE - 1) & PAGE_MASK;
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        if (mpol_check_policy(mode, nmask))
                return -EINVAL;

        new = mpol_new(mode, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n", start, start + len,
                        mode, nodes_addr(*nmask)[0]);

        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);

        err = PTR_ERR(vma);
        if (!IS_ERR(vma)) {
                int nr_failed = 0;

                err = mbind_range(vma, start, end, new);

                if (!list_empty(&pagelist))
                        nr_failed = migrate_pages_to(&pagelist, vma, -1);

                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
        }
        if (!list_empty(&pagelist))
                putback_lru_pages(&pagelist);

        up_write(&mm->mmap_sem);
        mpol_free(new);
        return err;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        unsigned long k;
        unsigned long nlongs;
        unsigned long endmask;

        --maxnode;
        nodes_clear(*nodes);
        if (maxnode == 0 || !nmask)
                return -EINVAL;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        nlongs = BITS_TO_LONGS(maxnode);
        if ((maxnode % BITS_PER_LONG) == 0)
                endmask = ~0UL;
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

        /* When the user specified more nodes than supported just check
           if the non supported part is all zero. */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
                if (nlongs > PAGE_SIZE/sizeof(long))
                        return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
                        unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
                                if (t & endmask)
                                        return -EINVAL;
                        } else if (t)
                                return -EINVAL;
                }
                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
                endmask = ~0UL;
        }

        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
        return 0;
}
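/*
 * Worked example (illustrative): a caller passes maxnode = 65 for a
 * 64-bit mask. After --maxnode there are 64 usable bits, nlongs = 1 and
 * endmask = ~0UL, so the whole first long is copied. With maxnode = 17,
 * endmask = (1UL << 16) - 1 and any set bit above bit 15 is masked off.
 * Bits beyond MAX_NUMNODES may be supplied but must all be zero.
 */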
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
        }
        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                        unsigned long mode,
                        unsigned long __user *nmask, unsigned long maxnode,
                        unsigned flags)
{
        nodemask_t nodes;
        int err;

        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_mbind(start, len, mode, &nodes, flags);
}
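/*
 * Illustrative userspace sketch (hedged; assumes the libnuma <numaif.h>
 * wrapper or an equivalent raw syscall): bind a freshly mapped region to
 * node 1 and ask for strict failure reporting.
 *
 *      void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      unsigned long mask = 1UL << 1;                  // node 1 only
 *      if (mbind(buf, 1 << 20, MPOL_BIND, &mask,
 *                sizeof(mask) * 8, MPOL_MF_STRICT) < 0)
 *              perror("mbind");
 */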
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
                unsigned long maxnode)
{
        int err;
        nodemask_t nodes;

        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;
        return do_set_mempolicy(mode, &nodes);
}
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                const unsigned long __user *old_nodes,
                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm;
        struct task_struct *task;
        nodemask_t old;
        nodemask_t new;
        nodemask_t task_nodes;
        int err;

        err = get_nodes(&old, old_nodes, maxnode);
        if (err)
                return err;

        err = get_nodes(&new, new_nodes, maxnode);
        if (err)
                return err;

        /* Find the mm_struct */
        read_lock(&tasklist_lock);
        task = pid ? find_task_by_pid(pid) : current;
        if (!task) {
                read_unlock(&tasklist_lock);
                return -ESRCH;
        }
        mm = get_task_mm(task);
        read_unlock(&tasklist_lock);

        if (!mm)
                return -EINVAL;

        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
            (current->uid != task->suid) && (current->uid != task->uid) &&
            !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out;
        }

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out;
        }

        err = do_migrate_pages(mm, &old, &new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
        mmput(mm);
        return err;
}
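/*
 * Illustrative userspace sketch (hedged; wrapper as provided by libnuma's
 * <numaif.h>): move every page of process 1234 that currently sits on
 * node 0 over to node 2.
 *
 *      unsigned long from = 1UL << 0, to = 1UL << 2;
 *      long left = migrate_pages(1234, sizeof(from) * 8, &from, &to);
 *      // left > 0: that many pages could not be moved; -1: error in errno.
 */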
/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr, unsigned long flags)
{
        int err, pval;
        nodemask_t nodes;

        if (nmask != NULL && maxnode < MAX_NUMNODES)
                return -EINVAL;

        err = do_get_mempolicy(&pval, &nodes, addr, flags);

        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (nmask)
                err = copy_nodes_to_user(nmask, maxnode, &nodes);

        return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                                     compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode,
                                     compat_ulong_t addr, compat_ulong_t flags)
{
        long err;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask)
                nm = compat_alloc_user_space(alloc_size);

        err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

        if (!err && nmask) {
                err = copy_from_user(bm, nm, alloc_size);
                /* ensure entire bitmap is zeroed */
                err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
                err |= compat_put_bitmap(nmask, bm, nr_bits);
        }

        return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                                     compat_ulong_t maxnode)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        DECLARE_BITMAP(bm, MAX_NUMNODES);

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(bm, nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, bm, alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                             compat_ulong_t mode, compat_ulong_t __user *nmask,
                             compat_ulong_t maxnode, compat_ulong_t flags)
{
        long err = 0;
        unsigned long __user *nm = NULL;
        unsigned long nr_bits, alloc_size;
        nodemask_t bm;

        nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
        alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

        if (nmask) {
                err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
                nm = compat_alloc_user_space(alloc_size);
                err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
        }

        if (err)
                return -EFAULT;

        return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/* Return effective policy for a VMA */
static struct mempolicy * get_vma_policy(struct task_struct *task,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = task->mempolicy;

        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy)
                        pol = vma->vm_ops->get_policy(vma, addr);
                else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
                        pol = vma->vm_policy;
        }
        if (!pol)
                pol = &default_policy;
        return pol;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
        int nd;

        switch (policy->policy) {
        case MPOL_PREFERRED:
                nd = policy->v.preferred_node;
                if (nd < 0)
                        nd = numa_node_id();
                break;
        case MPOL_BIND:
                /* Lower zones don't get a policy applied */
                /* Careful: current->mems_allowed might have moved */
                if (gfp_zone(gfp) >= policy_zone)
                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
                                return policy->v.zonelist;
                /*FALL THROUGH*/
        case MPOL_INTERLEAVE: /* should not happen */
        case MPOL_DEFAULT:
                nd = numa_node_id();
                break;
        default:
                nd = 0;
                BUG();
        }
        return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)
                next = first_node(policy->v.nodes);
        me->il_next = next;
        return nid;
}
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned slab_node(struct mempolicy *policy)
{
        switch (policy->policy) {
        case MPOL_INTERLEAVE:
                return interleave_nodes(policy);

        case MPOL_BIND:
                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node.
                 */
                return policy->v.zonelist->zones[0]->zone_pgdat->node_id;

        case MPOL_PREFERRED:
                if (policy->v.preferred_node >= 0)
                        return policy->v.preferred_node;
                /* Fall through */

        default:
                return numa_node_id();
        }
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long off)
{
        unsigned nnodes = nodes_weight(pol->v.nodes);
        unsigned target = (unsigned)off % nnodes;
        int c;
        int nid = -1;

        c = 0;
        do {
                nid = next_node(nid, pol->v.nodes);
                c++;
        } while (c <= target);
        return nid;
}
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
                struct vm_area_struct *vma, unsigned long addr, int shift)
{
        if (vma) {
                unsigned long off;

                off = vma->vm_pgoff;
                off += (addr - vma->vm_start) >> shift;
                return offset_il_node(pol, vma, off);
        } else
                return interleave_nodes(pol);
}
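/*
 * Worked example (illustrative): an anonymous VMA with vm_pgoff == 0, an
 * interleave mask of {0,1,2,3} and a fault at vma->vm_start + 3 pages.
 * off = 0 + 3, nnodes = 4, target = 3 % 4 = 3, so offset_il_node() walks
 * to the fourth set bit and the page is allocated on node 3; the next
 * page offset wraps back to node 0. The mapping is stable: the same
 * offset into the mapping always interleaves to the same node.
 */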
#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        if (pol->policy == MPOL_INTERLEAVE) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
                return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
        }
        return zonelist_policy(GFP_HIGHUSER, pol);
}
#endif
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                        unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
                zone_pcp(zl->zones[0], get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
}
/**
 *      alloc_page_vma  - Allocate a page for a VMA.
 *
 *      @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 *      @vma:  Pointer to VMA or NULL if not available.
 *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 *      This function allocates a page from the kernel page pool and applies
 *      a NUMA policy associated with the VMA or the current process.
 *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 *      mm_struct of the VMA to prevent it from going away. Should be used for
 *      all allocations for pages that will be mapped into
 *      user space. Returns NULL when no page can be allocated.
 *
 *      Should be called with the mm_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol = get_vma_policy(current, vma, addr);

        cpuset_update_task_memory_state();

        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;

                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
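/*
 * Minimal kernel-side usage sketch (illustrative, mirroring what the fault
 * handlers in mm/memory.c do): allocate a pageable, possibly-highmem page
 * for a user mapping while honouring its policy.
 *
 *      struct page *page;
 *
 *      page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *      if (!page)
 *              return VM_FAULT_OOM;
 */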
/**
 *      alloc_pages_current - Allocate pages.
 *
 *      @gfp:
 *      %GFP_USER   user allocation,
 *      %GFP_KERNEL kernel allocation,
 *      %GFP_HIGHMEM highmem allocation,
 *      %GFP_FS     don't call back into a file system.
 *      %GFP_ATOMIC don't sleep.
 *      @order: Power of two of allocation size in pages. 0 is a single page.
 *
 *      Allocate a page from the kernel page pool and, when not in
 *      interrupt context, apply the current process' NUMA policy.
 *      Returns NULL when no page can be allocated.
 *
 *      Don't call cpuset_update_task_memory_state() unless
 *      1) it's ok to take cpuset_sem (can WAIT), and
 *      2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        if ((gfp & __GFP_WAIT) && !in_interrupt())
                cpuset_update_task_memory_state();
        if (!pol || in_interrupt())
                pol = &default_policy;
        if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
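/*
 * Note (hedged): on CONFIG_NUMA kernels the generic alloc_pages() in
 * <linux/gfp.h> resolves to this function, so an ordinary call such as
 *
 *      struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * already goes through the current task's mempolicy; callers do not name
 * alloc_pages_current() directly.
 */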
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed(). This
 * keeps mempolicies cpuset relative after its cpuset moves. See
 * further kernel/cpuset.c update_nodemask().
 */
void *cpuset_being_rebound;
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

        if (!new)
                return ERR_PTR(-ENOMEM);
        if (current_cpuset_is_being_rebound()) {
                nodemask_t mems = cpuset_mems_allowed(current);
                mpol_rebind_policy(old, &mems);
        }
        *new = *old;
        atomic_set(&new->refcnt, 1);
        if (new->policy == MPOL_BIND) {
                int sz = ksize(old->v.zonelist);
                new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
                if (!new->v.zonelist) {
                        kmem_cache_free(policy_cache, new);
                        return ERR_PTR(-ENOMEM);
                }
                memcpy(new->v.zonelist, old->v.zonelist, sz);
        }
        return new;
}
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return 0;
        if (a->policy != b->policy)
                return 0;
        switch (a->policy) {
        case MPOL_DEFAULT:
                return 1;
        case MPOL_INTERLEAVE:
                return nodes_equal(a->v.nodes, b->v.nodes);
        case MPOL_PREFERRED:
                return a->v.preferred_node == b->v.preferred_node;
        case MPOL_BIND: {
                int i;
                for (i = 0; a->v.zonelist->zones[i]; i++)
                        if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
                                return 0;
                return b->v.zonelist->zones[i] == NULL;
        }
        default:
                BUG();
                return 0;
        }
}
/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
        if (!atomic_dec_and_test(&p->refcnt))
                return;
        if (p->policy == MPOL_BIND)
                kfree(p->v.zonelist);
        p->policy = MPOL_DEFAULT;
        kmem_cache_free(policy_cache, p);
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
        struct rb_node *n = sp->root.rb_node;

        while (n) {
                struct sp_node *p = rb_entry(n, struct sp_node, nd);

                if (start >= p->end)
                        n = n->rb_right;
                else if (end <= p->start)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;
        for (;;) {
                struct sp_node *w = NULL;
                struct rb_node *prev = rb_prev(n);
                if (!prev)
                        break;
                w = rb_entry(prev, struct sp_node, nd);
                if (w->end <= start)
                        break;
                n = prev;
        }
        return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **p = &sp->root.rb_node;
        struct rb_node *parent = NULL;
        struct sp_node *nd;

        while (*p) {
                parent = *p;
                nd = rb_entry(parent, struct sp_node, nd);
                if (new->start < nd->start)
                        p = &(*p)->rb_left;
                else if (new->end > nd->end)
                        p = &(*p)->rb_right;
                else
                        BUG();
        }
        rb_link_node(&new->nd, parent, p);
        rb_insert_color(&new->nd, &sp->root);
        PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
                 new->policy ? new->policy->policy : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
        struct mempolicy *pol = NULL;
        struct sp_node *sn;

        if (!sp->root.rb_node)
                return NULL;
        spin_lock(&sp->lock);
        sn = sp_lookup(sp, idx, idx+1);
        if (sn) {
                mpol_get(sn->policy);
                pol = sn->policy;
        }
        spin_unlock(&sp->lock);
        return pol;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        PDprintk("deleting %lx-%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
        mpol_free(n->policy);
        kmem_cache_free(sn_cache, n);
}
static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!n)
                return NULL;
        n->start = start;
        n->end = end;
        mpol_get(pol);
        n->policy = pol;
        return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n, *new2 = NULL;

restart:
        spin_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                if (!new2) {
                                        spin_unlock(&sp->lock);
                                        new2 = sp_alloc(end, n->end, n->policy);
                                        if (!new2)
                                                return -ENOMEM;
                                        goto restart;
                                }
                                n->end = start;
                                sp_insert(sp, new2);
                                new2 = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
                mpol_free(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
}
void mpol_shared_policy_init(struct shared_policy *info, int policy,
                                nodemask_t *policy_nodes)
{
        info->root = RB_ROOT;
        spin_lock_init(&info->lock);

        if (policy != MPOL_DEFAULT) {
                struct mempolicy *newpol;

                /* Falls back to MPOL_DEFAULT on any error */
                newpol = mpol_new(policy, policy_nodes);
                if (!IS_ERR(newpol)) {
                        /* Create pseudo-vma that contains just the policy */
                        struct vm_area_struct pvma;

                        memset(&pvma, 0, sizeof(struct vm_area_struct));
                        /* Policy covers entire file */
                        pvma.vm_end = TASK_SIZE;
                        mpol_set_shared_policy(info, &pvma, newpol);
                        mpol_free(newpol);
                }
        }
}
int mpol_set_shared_policy(struct shared_policy *info,
                        struct vm_area_struct *vma, struct mempolicy *npol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
                 vma->vm_pgoff,
                 sz, npol ? npol->policy : -1,
                 npol ? nodes_addr(npol->v.nodes)[0] : -1);

        if (npol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
        if (err && new)
                kmem_cache_free(sn_cache, new);
        return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
        struct sp_node *n;
        struct rb_node *next;

        if (!p->root.rb_node)
                return;
        spin_lock(&p->lock);
        next = rb_first(&p->root);
        while (next) {
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
                mpol_free(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL, NULL);

        /* Set interleaving policy for system init. This way not all
           the data structures allocated at system boot end up in node zero. */

        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                printk("numa_policy_init: interleaving failed\n");
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, NULL);
}
/* Migrate a policy to a different set of nodes */
void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        nodemask_t *mpolmask;
        nodemask_t tmp;

        if (!pol)
                return;
        mpolmask = &pol->cpuset_mems_allowed;
        if (nodes_equal(*mpolmask, *newmask))
                return;

        switch (pol->policy) {
        case MPOL_DEFAULT:
                break;
        case MPOL_INTERLEAVE:
                nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
                pol->v.nodes = tmp;
                *mpolmask = *newmask;
                current->il_next = node_remap(current->il_next,
                                                *mpolmask, *newmask);
                break;
        case MPOL_PREFERRED:
                pol->v.preferred_node = node_remap(pol->v.preferred_node,
                                                *mpolmask, *newmask);
                *mpolmask = *newmask;
                break;
        case MPOL_BIND: {
                nodemask_t nodes;
                struct zone **z;
                struct zonelist *zonelist;

                nodes_clear(nodes);
                for (z = pol->v.zonelist->zones; *z; z++)
                        node_set((*z)->zone_pgdat->node_id, nodes);
                nodes_remap(tmp, nodes, *mpolmask, *newmask);
                nodes = tmp;

                zonelist = bind_zonelist(&nodes);

                /* If no mem, then zonelist is NULL and we keep old zonelist.
                 * If that old zonelist has no remaining mems_allowed nodes,
                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                 */

                if (zonelist) {
                        /* Good - got mem - substitute new zonelist */
                        kfree(pol->v.zonelist);
                        pol->v.zonelist = zonelist;
                }
                *mpolmask = *newmask;
                break;
        }
        default:
                BUG();
                break;
        }
}
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        mpol_rebind_policy(tsk->mempolicy, new);
}
/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        struct vm_area_struct *vma;

        down_write(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next)
                mpol_rebind_policy(vma->vm_policy, new);
        up_write(&mm->mmap_sem);
}
/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
                                      "interleave" };

/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        int l;
        nodemask_t nodes;
        int mode = pol ? pol->policy : MPOL_DEFAULT;

        switch (mode) {
        case MPOL_DEFAULT:
                nodes_clear(nodes);
                break;

        case MPOL_PREFERRED:
                nodes_clear(nodes);
                node_set(pol->v.preferred_node, nodes);
                break;

        case MPOL_BIND:
                get_zonemask(pol, &nodes);
                break;

        case MPOL_INTERLEAVE:
                nodes = pol->v.nodes;
                break;

        default:
                BUG();
                return -1;
        }

        l = strlen(policy_types[mode]);
        if (buffer + maxlen < p + l + 1)
                return -ENOSPC;

        strcpy(p, policy_types[mode]);
        p += l;

        if (!nodes_empty(nodes)) {
                if (buffer + maxlen < p + 2)
                        return -ENOSPC;
                *p++ = '=';
                p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
        }
        return p - buffer;
}
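/*
 * Example outputs (illustrative): "default", "prefer=2", "bind=0-1" and
 * "interleave=0-3,6" - the policy name, then '=' and a node list whenever
 * the nodemask is non-empty.
 */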
struct numa_maps {
        unsigned long pages;
        unsigned long anon;
        unsigned long active;
        unsigned long writeback;
        unsigned long mapcount_max;
        unsigned long dirty;
        unsigned long swapcache;
        unsigned long node[MAX_NUMNODES];
};
static void gather_stats(struct page *page, void *private, int pte_dirty)
{
        struct numa_maps *md = private;
        int count = page_mapcount(page);

        md->pages++;
        if (pte_dirty || PageDirty(page))
                md->dirty++;

        if (PageSwapCache(page))
                md->swapcache++;

        if (PageActive(page))
                md->active++;

        if (PageWriteback(page))
                md->writeback++;

        if (PageAnon(page))
                md->anon++;

        if (count > md->mapcount_max)
                md->mapcount_max = count;

        md->node[page_to_nid(page)]++;
}
#ifdef CONFIG_HUGETLB_PAGE
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
                pte_t pte;

                if (!ptep)
                        continue;

                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                gather_stats(page, md, pte_dirty(*ptep));
        }
}
#else
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
int show_numa_map(struct seq_file *m, void *v)
{
        struct task_struct *task = m->private;
        struct vm_area_struct *vma = v;
        struct numa_maps *md;
        struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
        int n;
        char buffer[50];

        if (!mm)
                return 0;

        md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
        if (!md)
                return 0;

        mpol_to_str(buffer, sizeof(buffer),
                        get_vma_policy(task, vma, vma->vm_start));

        seq_printf(m, "%08lx %s", vma->vm_start, buffer);

        if (file) {
                seq_printf(m, " file=");
                seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
        } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                seq_printf(m, " heap");
        } else if (vma->vm_start <= mm->start_stack &&
                        vma->vm_end >= mm->start_stack) {
                seq_printf(m, " stack");
        }

        if (is_vm_hugetlb_page(vma)) {
                check_huge_range(vma, vma->vm_start, vma->vm_end, md);
                seq_printf(m, " huge");
        } else {
                check_pgd_range(vma, vma->vm_start, vma->vm_end,
                        &node_online_map, MPOL_MF_STATS, md);
        }

        if (!md->pages)
                goto out;

        if (md->anon)
                seq_printf(m, " anon=%lu", md->anon);

        if (md->dirty)
                seq_printf(m, " dirty=%lu", md->dirty);

        if (md->pages != md->anon && md->pages != md->dirty)
                seq_printf(m, " mapped=%lu", md->pages);

        if (md->mapcount_max > 1)
                seq_printf(m, " mapmax=%lu", md->mapcount_max);

        if (md->swapcache)
                seq_printf(m, " swapcache=%lu", md->swapcache);

        if (md->active < md->pages && !is_vm_hugetlb_page(vma))
                seq_printf(m, " active=%lu", md->active);

        if (md->writeback)
                seq_printf(m, " writeback=%lu", md->writeback);

        for_each_online_node(n)
                if (md->node[n])
                        seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
        seq_printf(m, "\n");
        kfree(md);

        if (m->count < m->size)
                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
        return 0;
}
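/*
 * Example /proc/<pid>/numa_maps line (illustrative):
 *
 *      2000000000 interleave=0-3 file=/lib/libc-2.3.6.so mapped=14 mapmax=4 N0=4 N1=4 N2=3 N3=3
 *
 * i.e. start address, the policy string from mpol_to_str(), the mapping
 * kind (file/heap/stack/huge), then the per-VMA counters gathered above.
 */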