mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/vs_cvirt.h>
  90
  91 #include <asm/tlbflush.h>
  92 #include <asm/uaccess.h>
  93
  94 /* Internal flags */
  95 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  96 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  97 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  98
  99 /* The number of pages to migrate per call to migrate_pages() */
 100 #define MIGRATE_CHUNK_SIZE 256
 101
 102 static kmem_cache_t *policy_cache;
 103 static kmem_cache_t *sn_cache;
 104
 105 #define PDprintk(fmt...)
 106
 107 /* Highest zone. An specific allocation for a zone below that is not
 108    policied. */
 109 int policy_zone = ZONE_DMA;
 110
 111 struct mempolicy default_policy = {
 112         .refcnt = ATOMIC_INIT(1), /* never free it */
 113         .policy = MPOL_DEFAULT,
 114 };
 115
 116 /* Do sanity checking on a policy */
 117 static int mpol_check_policy(int mode, nodemask_t *nodes)
 118 {
 119         int empty = nodes_empty(*nodes);
 120
 121         switch (mode) {
 122         case MPOL_DEFAULT:
 123                 if (!empty)
 124                         return -EINVAL;
 125                 break;
 126         case MPOL_BIND:
 127         case MPOL_INTERLEAVE:
 128                 /* Preferred will only use the first bit, but allow
 129                    more for now. */
 130                 if (empty)
 131                         return -EINVAL;
 132                 break;
 133         }
 134         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 135 }
 136
 137 /* Generate a custom zonelist for the BIND policy. */
 138 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 139 {
 140         struct zonelist *zl;
 141         int num, max, nd, k;
 142
 143         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 144         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 145         if (!zl)
 146                 return NULL;
 147         num = 0;
 148         /* First put in the highest zones from all nodes, then all the next
 149            lower zones etc. Avoid empty zones because the memory allocator
 150            doesn't like them. If you implement node hot removal you
 151            have to fix that. */
 152         for (k = policy_zone; k >= 0; k--) {
 153                 for_each_node_mask(nd, *nodes) {
 154                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 155                         if (z->present_pages > 0)
 156                                 zl->zones[num++] = z;
 157                 }
 158         }
 159         zl->zones[num] = NULL;
 160         return zl;
 161 }
 162
 163 /* Create a new policy */
 164 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 165 {
 166         struct mempolicy *policy;
 167
 168         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 169         if (mode == MPOL_DEFAULT)
 170                 return NULL;
 171         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 172         if (!policy)
 173                 return ERR_PTR(-ENOMEM);
 174         atomic_set(&policy->refcnt, 1);
 175         switch (mode) {
 176         case MPOL_INTERLEAVE:
 177                 policy->v.nodes = *nodes;
 178                 if (nodes_weight(*nodes) == 0) {
 179                         kmem_cache_free(policy_cache, policy);
 180                         return ERR_PTR(-EINVAL);
 181                 }
 182                 break;
 183         case MPOL_PREFERRED:
 184                 policy->v.preferred_node = first_node(*nodes);
 185                 if (policy->v.preferred_node >= MAX_NUMNODES)
 186                         policy->v.preferred_node = -1;
 187                 break;
 188         case MPOL_BIND:
 189                 policy->v.zonelist = bind_zonelist(nodes);
 190                 if (policy->v.zonelist == NULL) {
 191                         kmem_cache_free(policy_cache, policy);
 192                         return ERR_PTR(-ENOMEM);
 193                 }
 194                 break;
 195         }
 196         policy->policy = mode;
 197         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 198         return policy;
 199 }
 200
 201 static void gather_stats(struct page *, void *, int pte_dirty);
 202 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 203                                 unsigned long flags);
 204
 205 /* Scan through pages checking if pages follow certain conditions. */
 206 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 207                 unsigned long addr, unsigned long end,
 208                 const nodemask_t *nodes, unsigned long flags,
 209                 void *private)
 210 {
 211         pte_t *orig_pte;
 212         pte_t *pte;
 213         spinlock_t *ptl;
 214
 215         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 216         do {
 217                 struct page *page;
 218                 unsigned int nid;
 219
 220                 if (!pte_present(*pte))
 221                         continue;
 222                 page = vm_normal_page(vma, addr, *pte);
 223                 if (!page)
 224                         continue;
 225                 /*
 226                  * The check for PageReserved here is important to avoid
 227                  * handling zero pages and other pages that may have been
 228                  * marked special by the system.
 229                  *
 230                  * If the PageReserved would not be checked here then f.e.
 231                  * the location of the zero page could have an influence
 232                  * on MPOL_MF_STRICT, zero pages would be counted for
 233                  * the per node stats, and there would be useless attempts
 234                  * to put zero pages on the migration list.
 235                  */
 236                 if (PageReserved(page))
 237                         continue;
 238                 nid = page_to_nid(page);
 239                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 240                         continue;
 241
 242                 if (flags & MPOL_MF_STATS)
 243                         gather_stats(page, private, pte_dirty(*pte));
 244                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 245                         migrate_page_add(page, private, flags);
 246                 else
 247                         break;
 248         } while (pte++, addr += PAGE_SIZE, addr != end);
 249         pte_unmap_unlock(orig_pte, ptl);
 250         return addr != end;
 251 }
 252
 253 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 254                 unsigned long addr, unsigned long end,
 255                 const nodemask_t *nodes, unsigned long flags,
 256                 void *private)
 257 {
 258         pmd_t *pmd;
 259         unsigned long next;
 260
 261         pmd = pmd_offset(pud, addr);
 262         do {
 263                 next = pmd_addr_end(addr, end);
 264                 if (pmd_none_or_clear_bad(pmd))
 265                         continue;
 266                 if (check_pte_range(vma, pmd, addr, next, nodes,
 267                                     flags, private))
 268                         return -EIO;
 269         } while (pmd++, addr = next, addr != end);
 270         return 0;
 271 }
 272
 273 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 274                 unsigned long addr, unsigned long end,
 275                 const nodemask_t *nodes, unsigned long flags,
 276                 void *private)
 277 {
 278         pud_t *pud;
 279         unsigned long next;
 280
 281         pud = pud_offset(pgd, addr);
 282         do {
 283                 next = pud_addr_end(addr, end);
 284                 if (pud_none_or_clear_bad(pud))
 285                         continue;
 286                 if (check_pmd_range(vma, pud, addr, next, nodes,
 287                                     flags, private))
 288                         return -EIO;
 289         } while (pud++, addr = next, addr != end);
 290         return 0;
 291 }
 292
 293 static inline int check_pgd_range(struct vm_area_struct *vma,
 294                 unsigned long addr, unsigned long end,
 295                 const nodemask_t *nodes, unsigned long flags,
 296                 void *private)
 297 {
 298         pgd_t *pgd;
 299         unsigned long next;
 300
 301         pgd = pgd_offset(vma->vm_mm, addr);
 302         do {
 303                 next = pgd_addr_end(addr, end);
 304                 if (pgd_none_or_clear_bad(pgd))
 305                         continue;
 306                 if (check_pud_range(vma, pgd, addr, next, nodes,
 307                                     flags, private))
 308                         return -EIO;
 309         } while (pgd++, addr = next, addr != end);
 310         return 0;
 311 }
 312
 313 /* Check if a vma is migratable */
 314 static inline int vma_migratable(struct vm_area_struct *vma)
 315 {
 316         if (vma->vm_flags & (
 317                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 318                 return 0;
 319         return 1;
 320 }
 321
 322 /*
 323  * Check if all pages in a range are on a set of nodes.
 324  * If pagelist != NULL then isolate pages from the LRU and
 325  * put them on the pagelist.
 326  */
 327 static struct vm_area_struct *
 328 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 329                 const nodemask_t *nodes, unsigned long flags, void *private)
 330 {
 331         int err;
 332         struct vm_area_struct *first, *vma, *prev;
 333
 334         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 335                 /* Must have swap device for migration */
 336                 if (nr_swap_pages <= 0)
 337                         return ERR_PTR(-ENODEV);
 338
 339                 /*
 340                  * Clear the LRU lists so pages can be isolated.
 341                  * Note that pages may be moved off the LRU after we have
 342                  * drained them. Those pages will fail to migrate like other
 343                  * pages that may be busy.
 344                  */
 345                 lru_add_drain_all();
 346         }
 347
 348         first = find_vma(mm, start);
 349         if (!first)
 350                 return ERR_PTR(-EFAULT);
 351         prev = NULL;
 352         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 353                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 354                         if (!vma->vm_next && vma->vm_end < end)
 355                                 return ERR_PTR(-EFAULT);
 356                         if (prev && prev->vm_end < vma->vm_start)
 357                                 return ERR_PTR(-EFAULT);
 358                 }
 359                 if (!is_vm_hugetlb_page(vma) &&
 360                     ((flags & MPOL_MF_STRICT) ||
 361                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 362                                 vma_migratable(vma)))) {
 363                         unsigned long endvma = vma->vm_end;
 364
 365                         if (endvma > end)
 366                                 endvma = end;
 367                         if (vma->vm_start > start)
 368                                 start = vma->vm_start;
 369                         err = check_pgd_range(vma, start, endvma, nodes,
 370                                                 flags, private);
 371                         if (err) {
 372                                 first = ERR_PTR(err);
 373                                 break;
 374                         }
 375                 }
 376                 prev = vma;
 377         }
 378         return first;
 379 }
 380
 381 /* Apply policy to a single VMA */
 382 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 383 {
 384         int err = 0;
 385         struct mempolicy *old = vma->vm_policy;
 386
 387         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 388                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 389                  vma->vm_ops, vma->vm_file,
 390                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 391
 392         if (vma->vm_ops && vma->vm_ops->set_policy)
 393                 err = vma->vm_ops->set_policy(vma, new);
 394         if (!err) {
 395                 mpol_get(new);
 396                 vma->vm_policy = new;
 397                 mpol_free(old);
 398         }
 399         return err;
 400 }
 401
 402 /* Step 2: apply policy to a range and do splits. */
 403 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 404                        unsigned long end, struct mempolicy *new)
 405 {
 406         struct vm_area_struct *next;
 407         int err;
 408
 409         err = 0;
 410         for (; vma && vma->vm_start < end; vma = next) {
 411                 next = vma->vm_next;
 412                 if (vma->vm_start < start)
 413                         err = split_vma(vma->vm_mm, vma, start, 1);
 414                 if (!err && vma->vm_end > end)
 415                         err = split_vma(vma->vm_mm, vma, end, 0);
 416                 if (!err)
 417                         err = policy_vma(vma, new);
 418                 if (err)
 419                         break;
 420         }
 421         return err;
 422 }
 423
 424 static int contextualize_policy(int mode, nodemask_t *nodes)
 425 {
 426         if (!nodes)
 427                 return 0;
 428
 429         cpuset_update_task_memory_state();
 430         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 431                 return -EINVAL;
 432         return mpol_check_policy(mode, nodes);
 433 }
 434
 435 /* Set the process memory policy */
 436 long do_set_mempolicy(int mode, nodemask_t *nodes)
 437 {
 438         struct mempolicy *new;
 439
 440         if (contextualize_policy(mode, nodes))
 441                 return -EINVAL;
 442         new = mpol_new(mode, nodes);
 443         if (IS_ERR(new))
 444                 return PTR_ERR(new);
 445         mpol_free(current->mempolicy);
 446         current->mempolicy = new;
 447         if (new && new->policy == MPOL_INTERLEAVE)
 448                 current->il_next = first_node(new->v.nodes);
 449         return 0;
 450 }
 451
 452 /* Fill a zone bitmap for a policy */
 453 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 454 {
 455         int i;
 456
 457         nodes_clear(*nodes);
 458         switch (p->policy) {
 459         case MPOL_BIND:
 460                 for (i = 0; p->v.zonelist->zones[i]; i++)
 461                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 462                                 *nodes);
 463                 break;
 464         case MPOL_DEFAULT:
 465                 break;
 466         case MPOL_INTERLEAVE:
 467                 *nodes = p->v.nodes;
 468                 break;
 469         case MPOL_PREFERRED:
 470                 /* or use current node instead of online map? */
 471                 if (p->v.preferred_node < 0)
 472                         *nodes = node_online_map;
 473                 else
 474                         node_set(p->v.preferred_node, *nodes);
 475                 break;
 476         default:
 477                 BUG();
 478         }
 479 }
 480
 481 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 482 {
 483         struct page *p;
 484         int err;
 485
 486         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 487         if (err >= 0) {
 488                 err = page_to_nid(p);
 489                 put_page(p);
 490         }
 491         return err;
 492 }
 493
 494 /* Retrieve NUMA policy */
 495 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 496                         unsigned long addr, unsigned long flags)
 497 {
 498         int err;
 499         struct mm_struct *mm = current->mm;
 500         struct vm_area_struct *vma = NULL;
 501         struct mempolicy *pol = current->mempolicy;
 502
 503         cpuset_update_task_memory_state();
 504         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 505                 return -EINVAL;
 506         if (flags & MPOL_F_ADDR) {
 507                 down_read(&mm->mmap_sem);
 508                 vma = find_vma_intersection(mm, addr, addr+1);
 509                 if (!vma) {
 510                         up_read(&mm->mmap_sem);
 511                         return -EFAULT;
 512                 }
 513                 if (vma->vm_ops && vma->vm_ops->get_policy)
 514                         pol = vma->vm_ops->get_policy(vma, addr);
 515                 else
 516                         pol = vma->vm_policy;
 517         } else if (addr)
 518                 return -EINVAL;
 519
 520         if (!pol)
 521                 pol = &default_policy;
 522
 523         if (flags & MPOL_F_NODE) {
 524                 if (flags & MPOL_F_ADDR) {
 525                         err = lookup_node(mm, addr);
 526                         if (err < 0)
 527                                 goto out;
 528                         *policy = err;
 529                 } else if (pol == current->mempolicy &&
 530                                 pol->policy == MPOL_INTERLEAVE) {
 531                         *policy = current->il_next;
 532                 } else {
 533                         err = -EINVAL;
 534                         goto out;
 535                 }
 536         } else
 537                 *policy = pol->policy;
 538
 539         if (vma) {
 540                 up_read(&current->mm->mmap_sem);
 541                 vma = NULL;
 542         }
 543
 544         err = 0;
 545         if (nmask)
 546                 get_zonemask(pol, nmask);
 547
 548  out:
 549         if (vma)
 550                 up_read(&current->mm->mmap_sem);
 551         return err;
 552 }
 553
 554 /*
 555  * page migration
 556  */
 557
 558 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 559                                 unsigned long flags)
 560 {
 561         /*
 562          * Avoid migrating a page that is shared with others.
 563          */
 564         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 565                 if (isolate_lru_page(page))
 566                         list_add_tail(&page->lru, pagelist);
 567         }
 568 }
 569
 570 /*
 571  * Migrate the list 'pagelist' of pages to a certain destination.
 572  *
 573  * Specify destination with either non-NULL vma or dest_node >= 0
 574  * Return the number of pages not migrated or error code
 575  */
 576 static int migrate_pages_to(struct list_head *pagelist,
 577                         struct vm_area_struct *vma, int dest)
 578 {
 579         LIST_HEAD(newlist);
 580         LIST_HEAD(moved);
 581         LIST_HEAD(failed);
 582         int err = 0;
 583         unsigned long offset = 0;
 584         int nr_pages;
 585         struct page *page;
 586         struct list_head *p;
 587
 588 redo:
 589         nr_pages = 0;
 590         list_for_each(p, pagelist) {
 591                 if (vma) {
 592                         /*
 593                          * The address passed to alloc_page_vma is used to
 594                          * generate the proper interleave behavior. We fake
 595                          * the address here by an increasing offset in order
 596                          * to get the proper distribution of pages.
 597                          *
 598                          * No decision has been made as to which page
 599                          * a certain old page is moved to so we cannot
 600                          * specify the correct address.
 601                          */
 602                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 603                                         offset + vma->vm_start);
 604                         offset += PAGE_SIZE;
 605                 }
 606                 else
 607                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 608
 609                 if (!page) {
 610                         err = -ENOMEM;
 611                         goto out;
 612                 }
 613                 list_add_tail(&page->lru, &newlist);
 614                 nr_pages++;
 615                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 616                         break;
 617         }
 618         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 619
 620         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 621
 622         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 623                 goto redo;
 624 out:
 625         /* Return leftover allocated pages */
 626         while (!list_empty(&newlist)) {
 627                 page = list_entry(newlist.next, struct page, lru);
 628                 list_del(&page->lru);
 629                 __free_page(page);
 630         }
 631         list_splice(&failed, pagelist);
 632         if (err < 0)
 633                 return err;
 634
 635         /* Calculate number of leftover pages */
 636         nr_pages = 0;
 637         list_for_each(p, pagelist)
 638                 nr_pages++;
 639         return nr_pages;
 640 }
 641
 642 /*
 643  * Migrate pages from one node to a target node.
 644  * Returns error or the number of pages not migrated.
 645  */
 646 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 647 {
 648         nodemask_t nmask;
 649         LIST_HEAD(pagelist);
 650         int err = 0;
 651
 652         nodes_clear(nmask);
 653         node_set(source, nmask);
 654
 655         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 656                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 657
 658         if (!list_empty(&pagelist)) {
 659                 err = migrate_pages_to(&pagelist, NULL, dest);
 660                 if (!list_empty(&pagelist))
 661                         putback_lru_pages(&pagelist);
 662         }
 663         return err;
 664 }
 665
 666 /*
 667  * Move pages between the two nodesets so as to preserve the physical
 668  * layout as much as possible.
 669  *
 670  * Returns the number of page that could not be moved.
 671  */
 672 int do_migrate_pages(struct mm_struct *mm,
 673         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 674 {
 675         LIST_HEAD(pagelist);
 676         int busy = 0;
 677         int err = 0;
 678         nodemask_t tmp;
 679
 680         down_read(&mm->mmap_sem);
 681
 682 /*
 683  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 684  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 685  * bit in 'tmp', and return that <source, dest> pair for migration.
 686  * The pair of nodemasks 'to' and 'from' define the map.
 687  *
 688  * If no pair of bits is found that way, fallback to picking some
 689  * pair of 'source' and 'dest' bits that are not the same.  If the
 690  * 'source' and 'dest' bits are the same, this represents a node
 691  * that will be migrating to itself, so no pages need move.
 692  *
 693  * If no bits are left in 'tmp', or if all remaining bits left
 694  * in 'tmp' correspond to the same bit in 'to', return false
 695  * (nothing left to migrate).
 696  *
 697  * This lets us pick a pair of nodes to migrate between, such that
 698  * if possible the dest node is not already occupied by some other
 699  * source node, minimizing the risk of overloading the memory on a
 700  * node that would happen if we migrated incoming memory to a node
 701  * before migrating outgoing memory source that same node.
 702  *
 703  * A single scan of tmp is sufficient.  As we go, we remember the
 704  * most recent <s, d> pair that moved (s != d).  If we find a pair
 705  * that not only moved, but what's better, moved to an empty slot
 706  * (d is not set in tmp), then we break out then, with that pair.
 707  * Otherwise when we finish scannng from_tmp, we at least have the
 708  * most recent <s, d> pair that moved.  If we get all the way through
 709  * the scan of tmp without finding any node that moved, much less
 710  * moved to an empty node, then there is nothing left worth migrating.
 711  */
 712
 713         tmp = *from_nodes;
 714         while (!nodes_empty(tmp)) {
 715                 int s,d;
 716                 int source = -1;
 717                 int dest = 0;
 718
 719                 for_each_node_mask(s, tmp) {
 720                         d = node_remap(s, *from_nodes, *to_nodes);
 721                         if (s == d)
 722                                 continue;
 723
 724                         source = s;     /* Node moved. Memorize */
 725                         dest = d;
 726
 727                         /* dest not in remaining from nodes? */
 728                         if (!node_isset(dest, tmp))
 729                                 break;
 730                 }
 731                 if (source == -1)
 732                         break;
 733
 734                 node_clear(source, tmp);
 735                 err = migrate_to_node(mm, source, dest, flags);
 736                 if (err > 0)
 737                         busy += err;
 738                 if (err < 0)
 739                         break;
 740         }
 741
 742         up_read(&mm->mmap_sem);
 743         if (err < 0)
 744                 return err;
 745         return busy;
 746 }
 747
 748 long do_mbind(unsigned long start, unsigned long len,
 749                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 750 {
 751         struct vm_area_struct *vma;
 752         struct mm_struct *mm = current->mm;
 753         struct mempolicy *new;
 754         unsigned long end;
 755         int err;
 756         LIST_HEAD(pagelist);
 757
 758         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 759                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 760             || mode > MPOL_MAX)
 761                 return -EINVAL;
 762         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 763                 return -EPERM;
 764
 765         if (start & ~PAGE_MASK)
 766                 return -EINVAL;
 767
 768         if (mode == MPOL_DEFAULT)
 769                 flags &= ~MPOL_MF_STRICT;
 770
 771         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 772         end = start + len;
 773
 774         if (end < start)
 775                 return -EINVAL;
 776         if (end == start)
 777                 return 0;
 778
 779         if (mpol_check_policy(mode, nmask))
 780                 return -EINVAL;
 781
 782         new = mpol_new(mode, nmask);
 783         if (IS_ERR(new))
 784                 return PTR_ERR(new);
 785
 786         /*
 787          * If we are using the default policy then operation
 788          * on discontinuous address spaces is okay after all
 789          */
 790         if (!new)
 791                 flags |= MPOL_MF_DISCONTIG_OK;
 792
 793         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 794                         mode,nodes_addr(nodes)[0]);
 795
 796         down_write(&mm->mmap_sem);
 797         vma = check_range(mm, start, end, nmask,
 798                           flags | MPOL_MF_INVERT, &pagelist);
 799
 800         err = PTR_ERR(vma);
 801         if (!IS_ERR(vma)) {
 802                 int nr_failed = 0;
 803
 804                 err = mbind_range(vma, start, end, new);
 805
 806                 if (!list_empty(&pagelist))
 807                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 808
 809                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 810                         err = -EIO;
 811         }
 812         if (!list_empty(&pagelist))
 813                 putback_lru_pages(&pagelist);
 814
 815         up_write(&mm->mmap_sem);
 816         mpol_free(new);
 817         return err;
 818 }
 819
 820 /*
 821  * User space interface with variable sized bitmaps for nodelists.
 822  */
 823
 824 /* Copy a node mask from user space. */
 825 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 826                      unsigned long maxnode)
 827 {
 828         unsigned long k;
 829         unsigned long nlongs;
 830         unsigned long endmask;
 831
 832         --maxnode;
 833         nodes_clear(*nodes);
 834         if (maxnode == 0 || !nmask)
 835                 return 0;
 836         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 837                 return -EINVAL;
 838
 839         nlongs = BITS_TO_LONGS(maxnode);
 840         if ((maxnode % BITS_PER_LONG) == 0)
 841                 endmask = ~0UL;
 842         else
 843                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 844
 845         /* When the user specified more nodes than supported just check
 846            if the non supported part is all zero. */
 847         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 848                 if (nlongs > PAGE_SIZE/sizeof(long))
 849                         return -EINVAL;
 850                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 851                         unsigned long t;
 852                         if (get_user(t, nmask + k))
 853                                 return -EFAULT;
 854                         if (k == nlongs - 1) {
 855                                 if (t & endmask)
 856                                         return -EINVAL;
 857                         } else if (t)
 858                                 return -EINVAL;
 859                 }
 860                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 861                 endmask = ~0UL;
 862         }
 863
 864         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 865                 return -EFAULT;
 866         nodes_addr(*nodes)[nlongs-1] &= endmask;
 867         return 0;
 868 }
 869
 870 /* Copy a kernel node mask to user space */
 871 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 872                               nodemask_t *nodes)
 873 {
 874         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 875         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 876
 877         if (copy > nbytes) {
 878                 if (copy > PAGE_SIZE)
 879                         return -EINVAL;
 880                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 881                         return -EFAULT;
 882                 copy = nbytes;
 883         }
 884         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 885 }
 886
 887 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 888                         unsigned long mode,
 889                         unsigned long __user *nmask, unsigned long maxnode,
 890                         unsigned flags)
 891 {
 892         nodemask_t nodes;
 893         int err;
 894
 895         err = get_nodes(&nodes, nmask, maxnode);
 896         if (err)
 897                 return err;
 898         return do_mbind(start, len, mode, &nodes, flags);
 899 }
 900
 901 /* Set the process memory policy */
 902 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 903                 unsigned long maxnode)
 904 {
 905         int err;
 906         nodemask_t nodes;
 907
 908         if (mode < 0 || mode > MPOL_MAX)
 909                 return -EINVAL;
 910         err = get_nodes(&nodes, nmask, maxnode);
 911         if (err)
 912                 return err;
 913         return do_set_mempolicy(mode, &nodes);
 914 }
 915
 916 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 917                 const unsigned long __user *old_nodes,
 918                 const unsigned long __user *new_nodes)
 919 {
 920         struct mm_struct *mm;
 921         struct task_struct *task;
 922         nodemask_t old;
 923         nodemask_t new;
 924         nodemask_t task_nodes;
 925         int err;
 926
 927         err = get_nodes(&old, old_nodes, maxnode);
 928         if (err)
 929                 return err;
 930
 931         err = get_nodes(&new, new_nodes, maxnode);
 932         if (err)
 933                 return err;
 934
 935         /* Find the mm_struct */
 936         read_lock(&tasklist_lock);
 937         task = pid ? find_task_by_pid(pid) : current;
 938         if (!task) {
 939                 read_unlock(&tasklist_lock);
 940                 return -ESRCH;
 941         }
 942         mm = get_task_mm(task);
 943         read_unlock(&tasklist_lock);
 944
 945         if (!mm)
 946                 return -EINVAL;
 947
 948         /*
 949          * Check if this process has the right to modify the specified
 950          * process. The right exists if the process has administrative
 951          * capabilities, superuser priviledges or the same
 952          * userid as the target process.
 953          */
 954         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 955             (current->uid != task->suid) && (current->uid != task->uid) &&
 956             !capable(CAP_SYS_NICE)) {
 957                 err = -EPERM;
 958                 goto out;
 959         }
 960
 961         task_nodes = cpuset_mems_allowed(task);
 962         /* Is the user allowed to access the target nodes? */
 963         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 964                 err = -EPERM;
 965                 goto out;
 966         }
 967
 968         err = do_migrate_pages(mm, &old, &new,
 969                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 970 out:
 971         mmput(mm);
 972         return err;
 973 }
 974
 975
 976 /* Retrieve NUMA policy */
 977 asmlinkage long sys_get_mempolicy(int __user *policy,
 978                                 unsigned long __user *nmask,
 979                                 unsigned long maxnode,
 980                                 unsigned long addr, unsigned long flags)
 981 {
 982         int err, pval;
 983         nodemask_t nodes;
 984
 985         if (nmask != NULL && maxnode < MAX_NUMNODES)
 986                 return -EINVAL;
 987
 988         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 989
 990         if (err)
 991                 return err;
 992
 993         if (policy && put_user(pval, policy))
 994                 return -EFAULT;
 995
 996         if (nmask)
 997                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 998
 999         return err;
1000 }
1001
1002 #ifdef CONFIG_COMPAT
1003
1004 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1005                                      compat_ulong_t __user *nmask,
1006                                      compat_ulong_t maxnode,
1007                                      compat_ulong_t addr, compat_ulong_t flags)
1008 {
1009         long err;
1010         unsigned long __user *nm = NULL;
1011         unsigned long nr_bits, alloc_size;
1012         DECLARE_BITMAP(bm, MAX_NUMNODES);
1013
1014         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1015         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1016
1017         if (nmask)
1018                 nm = compat_alloc_user_space(alloc_size);
1019
1020         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1021
1022         if (!err && nmask) {
1023                 err = copy_from_user(bm, nm, alloc_size);
1024                 /* ensure entire bitmap is zeroed */
1025                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1026                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1027         }
1028
1029         return err;
1030 }
1031
1032 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1033                                      compat_ulong_t maxnode)
1034 {
1035         long err = 0;
1036         unsigned long __user *nm = NULL;
1037         unsigned long nr_bits, alloc_size;
1038         DECLARE_BITMAP(bm, MAX_NUMNODES);
1039
1040         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1041         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1042
1043         if (nmask) {
1044                 err = compat_get_bitmap(bm, nmask, nr_bits);
1045                 nm = compat_alloc_user_space(alloc_size);
1046                 err |= copy_to_user(nm, bm, alloc_size);
1047         }
1048
1049         if (err)
1050                 return -EFAULT;
1051
1052         return sys_set_mempolicy(mode, nm, nr_bits+1);
1053 }
1054
1055 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1056                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1057                              compat_ulong_t maxnode, compat_ulong_t flags)
1058 {
1059         long err = 0;
1060         unsigned long __user *nm = NULL;
1061         unsigned long nr_bits, alloc_size;
1062         nodemask_t bm;
1063
1064         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1065         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1066
1067         if (nmask) {
1068                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1069                 nm = compat_alloc_user_space(alloc_size);
1070                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1071         }
1072
1073         if (err)
1074                 return -EFAULT;
1075
1076         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1077 }
1078
1079 #endif
1080
1081 /* Return effective policy for a VMA */
1082 static struct mempolicy * get_vma_policy(struct task_struct *task,
1083                 struct vm_area_struct *vma, unsigned long addr)
1084 {
1085         struct mempolicy *pol = task->mempolicy;
1086
1087         if (vma) {
1088                 if (vma->vm_ops && vma->vm_ops->get_policy)
1089                         pol = vma->vm_ops->get_policy(vma, addr);
1090                 else if (vma->vm_policy &&
1091                                 vma->vm_policy->policy != MPOL_DEFAULT)
1092                         pol = vma->vm_policy;
1093         }
1094         if (!pol)
1095                 pol = &default_policy;
1096         return pol;
1097 }
1098
1099 /* Return a zonelist representing a mempolicy */
1100 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1101 {
1102         int nd;
1103
1104         switch (policy->policy) {
1105         case MPOL_PREFERRED:
1106                 nd = policy->v.preferred_node;
1107                 if (nd < 0)
1108                         nd = numa_node_id();
1109                 break;
1110         case MPOL_BIND:
1111                 /* Lower zones don't get a policy applied */
1112                 /* Careful: current->mems_allowed might have moved */
1113                 if (gfp_zone(gfp) >= policy_zone)
1114                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1115                                 return policy->v.zonelist;
1116                 /*FALL THROUGH*/
1117         case MPOL_INTERLEAVE: /* should not happen */
1118         case MPOL_DEFAULT:
1119                 nd = numa_node_id();
1120                 break;
1121         default:
1122                 nd = 0;
1123                 BUG();
1124         }
1125         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1126 }
1127
1128 /* Do dynamic interleaving for a process */
1129 static unsigned interleave_nodes(struct mempolicy *policy)
1130 {
1131         unsigned nid, next;
1132         struct task_struct *me = current;
1133
1134         nid = me->il_next;
1135         next = next_node(nid, policy->v.nodes);
1136         if (next >= MAX_NUMNODES)
1137                 next = first_node(policy->v.nodes);
1138         me->il_next = next;
1139         return nid;
1140 }
1141
1142 /*
1143  * Depending on the memory policy provide a node from which to allocate the
1144  * next slab entry.
1145  */
1146 unsigned slab_node(struct mempolicy *policy)
1147 {
1148         switch (policy->policy) {
1149         case MPOL_INTERLEAVE:
1150                 return interleave_nodes(policy);
1151
1152         case MPOL_BIND:
1153                 /*
1154                  * Follow bind policy behavior and start allocation at the
1155                  * first node.
1156                  */
1157                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1158
1159         case MPOL_PREFERRED:
1160                 if (policy->v.preferred_node >= 0)
1161                         return policy->v.preferred_node;
1162                 /* Fall through */
1163
1164         default:
1165                 return numa_node_id();
1166         }
1167 }
1168
1169 /* Do static interleaving for a VMA with known offset. */
1170 static unsigned offset_il_node(struct mempolicy *pol,
1171                 struct vm_area_struct *vma, unsigned long off)
1172 {
1173         unsigned nnodes = nodes_weight(pol->v.nodes);
1174         unsigned target = (unsigned)off % nnodes;
1175         int c;
1176         int nid = -1;
1177
1178         c = 0;
1179         do {
1180                 nid = next_node(nid, pol->v.nodes);
1181                 c++;
1182         } while (c <= target);
1183         return nid;
1184 }
1185
1186 /* Determine a node number for interleave */
1187 static inline unsigned interleave_nid(struct mempolicy *pol,
1188                  struct vm_area_struct *vma, unsigned long addr, int shift)
1189 {
1190         if (vma) {
1191                 unsigned long off;
1192
1193                 off = vma->vm_pgoff;
1194                 off += (addr - vma->vm_start) >> shift;
1195                 return offset_il_node(pol, vma, off);
1196         } else
1197                 return interleave_nodes(pol);
1198 }
1199
1200 #ifdef CONFIG_HUGETLBFS
1201 /* Return a zonelist suitable for a huge page allocation. */
1202 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1203 {
1204         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1205
1206         if (pol->policy == MPOL_INTERLEAVE) {
1207                 unsigned nid;
1208
1209                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1210                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1211         }
1212         return zonelist_policy(GFP_HIGHUSER, pol);
1213 }
1214 #endif
1215
1216 /* Allocate a page in interleaved policy.
1217    Own path because it needs to do special accounting. */
1218 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1219                                         unsigned nid)
1220 {
1221         struct zonelist *zl;
1222         struct page *page;
1223
1224         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1225         page = __alloc_pages(gfp, order, zl);
1226         if (page && page_zone(page) == zl->zones[0]) {
1227                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1228                 put_cpu();
1229         }
1230         return page;
1231 }
1232
1233 /**
1234  *      alloc_page_vma  - Allocate a page for a VMA.
1235  *
1236  *      @gfp:
1237  *      %GFP_USER    user allocation.
1238  *      %GFP_KERNEL  kernel allocations,
1239  *      %GFP_HIGHMEM highmem/user allocations,
1240  *      %GFP_FS      allocation should not call back into a file system.
1241  *      %GFP_ATOMIC  don't sleep.
1242  *
1243  *      @vma:  Pointer to VMA or NULL if not available.
1244  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1245  *
1246  *      This function allocates a page from the kernel page pool and applies
1247  *      a NUMA policy associated with the VMA or the current process.
1248  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1249  *      mm_struct of the VMA to prevent it from going away. Should be used for
1250  *      all allocations for pages that will be mapped into
1251  *      user space. Returns NULL when no page can be allocated.
1252  *
1253  *      Should be called with the mm_sem of the vma hold.
1254  */
1255 struct page *
1256 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1257 {
1258         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1259
1260         cpuset_update_task_memory_state();
1261
1262         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1263                 unsigned nid;
1264
1265                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1266                 return alloc_page_interleave(gfp, 0, nid);
1267         }
1268         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1269 }
1270
1271 /**
1272  *      alloc_pages_current - Allocate pages.
1273  *
1274  *      @gfp:
1275  *              %GFP_USER   user allocation,
1276  *              %GFP_KERNEL kernel allocation,
1277  *              %GFP_HIGHMEM highmem allocation,
1278  *              %GFP_FS     don't call back into a file system.
1279  *              %GFP_ATOMIC don't sleep.
1280  *      @order: Power of two of allocation size in pages. 0 is a single page.
1281  *
1282  *      Allocate a page from the kernel page pool.  When not in
1283  *      interrupt context and apply the current process NUMA policy.
1284  *      Returns NULL when no page can be allocated.
1285  *
1286  *      Don't call cpuset_update_task_memory_state() unless
1287  *      1) it's ok to take cpuset_sem (can WAIT), and
1288  *      2) allocating for current task (not interrupt).
1289  */
1290 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1291 {
1292         struct mempolicy *pol = current->mempolicy;
1293
1294         if ((gfp & __GFP_WAIT) && !in_interrupt())
1295                 cpuset_update_task_memory_state();
1296         if (!pol || in_interrupt())
1297                 pol = &default_policy;
1298         if (pol->policy == MPOL_INTERLEAVE)
1299                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1300         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1301 }
1302 EXPORT_SYMBOL(alloc_pages_current);
1303
1304 /*
1305  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1306  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1307  * with the mems_allowed returned by cpuset_mems_allowed().  This
1308  * keeps mempolicies cpuset relative after its cpuset moves.  See
1309  * further kernel/cpuset.c update_nodemask().
1310  */
1311 void *cpuset_being_rebound;
1312
1313 /* Slow path of a mempolicy copy */
1314 struct mempolicy *__mpol_copy(struct mempolicy *old)
1315 {
1316         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1317
1318         if (!new)
1319                 return ERR_PTR(-ENOMEM);
1320         if (current_cpuset_is_being_rebound()) {
1321                 nodemask_t mems = cpuset_mems_allowed(current);
1322                 mpol_rebind_policy(old, &mems);
1323         }
1324         *new = *old;
1325         atomic_set(&new->refcnt, 1);
1326         if (new->policy == MPOL_BIND) {
1327                 int sz = ksize(old->v.zonelist);
1328                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1329                 if (!new->v.zonelist) {
1330                         kmem_cache_free(policy_cache, new);
1331                         return ERR_PTR(-ENOMEM);
1332                 }
1333                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1334         }
1335         return new;
1336 }
1337
1338 /* Slow path of a mempolicy comparison */
1339 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1340 {
1341         if (!a || !b)
1342                 return 0;
1343         if (a->policy != b->policy)
1344                 return 0;
1345         switch (a->policy) {
1346         case MPOL_DEFAULT:
1347                 return 1;
1348         case MPOL_INTERLEAVE:
1349                 return nodes_equal(a->v.nodes, b->v.nodes);
1350         case MPOL_PREFERRED:
1351                 return a->v.preferred_node == b->v.preferred_node;
1352         case MPOL_BIND: {
1353                 int i;
1354                 for (i = 0; a->v.zonelist->zones[i]; i++)
1355                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1356                                 return 0;
1357                 return b->v.zonelist->zones[i] == NULL;
1358         }
1359         default:
1360                 BUG();
1361                 return 0;
1362         }
1363 }
1364
1365 /* Slow path of a mpol destructor. */
1366 void __mpol_free(struct mempolicy *p)
1367 {
1368         if (!atomic_dec_and_test(&p->refcnt))
1369                 return;
1370         if (p->policy == MPOL_BIND)
1371                 kfree(p->v.zonelist);
1372         p->policy = MPOL_DEFAULT;
1373         kmem_cache_free(policy_cache, p);
1374 }
1375
1376 /*
1377  * Shared memory backing store policy support.
1378  *
1379  * Remember policies even when nobody has shared memory mapped.
1380  * The policies are kept in Red-Black tree linked from the inode.
1381  * They are protected by the sp->lock spinlock, which should be held
1382  * for any accesses to the tree.
1383  */
1384
1385 /* lookup first element intersecting start-end */
1386 /* Caller holds sp->lock */
1387 static struct sp_node *
1388 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1389 {
1390         struct rb_node *n = sp->root.rb_node;
1391
1392         while (n) {
1393                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1394
1395                 if (start >= p->end)
1396                         n = n->rb_right;
1397                 else if (end <= p->start)
1398                         n = n->rb_left;
1399                 else
1400                         break;
1401         }
1402         if (!n)
1403                 return NULL;
1404         for (;;) {
1405                 struct sp_node *w = NULL;
1406                 struct rb_node *prev = rb_prev(n);
1407                 if (!prev)
1408                         break;
1409                 w = rb_entry(prev, struct sp_node, nd);
1410                 if (w->end <= start)
1411                         break;
1412                 n = prev;
1413         }
1414         return rb_entry(n, struct sp_node, nd);
1415 }
1416
1417 /* Insert a new shared policy into the list. */
1418 /* Caller holds sp->lock */
1419 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1420 {
1421         struct rb_node **p = &sp->root.rb_node;
1422         struct rb_node *parent = NULL;
1423         struct sp_node *nd;
1424
1425         while (*p) {
1426                 parent = *p;
1427                 nd = rb_entry(parent, struct sp_node, nd);
1428                 if (new->start < nd->start)
1429                         p = &(*p)->rb_left;
1430                 else if (new->end > nd->end)
1431                         p = &(*p)->rb_right;
1432                 else
1433                         BUG();
1434         }
1435         rb_link_node(&new->nd, parent, p);
1436         rb_insert_color(&new->nd, &sp->root);
1437         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1438                  new->policy ? new->policy->policy : 0);
1439 }
1440
1441 /* Find shared policy intersecting idx */
1442 struct mempolicy *
1443 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1444 {
1445         struct mempolicy *pol = NULL;
1446         struct sp_node *sn;
1447
1448         if (!sp->root.rb_node)
1449                 return NULL;
1450         spin_lock(&sp->lock);
1451         sn = sp_lookup(sp, idx, idx+1);
1452         if (sn) {
1453                 mpol_get(sn->policy);
1454                 pol = sn->policy;
1455         }
1456         spin_unlock(&sp->lock);
1457         return pol;
1458 }
1459
1460 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1461 {
1462         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1463         rb_erase(&n->nd, &sp->root);
1464         mpol_free(n->policy);
1465         kmem_cache_free(sn_cache, n);
1466 }
1467
1468 struct sp_node *
1469 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1470 {
1471         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1472
1473         if (!n)
1474                 return NULL;
1475         n->start = start;
1476         n->end = end;
1477         mpol_get(pol);
1478         n->policy = pol;
1479         return n;
1480 }
1481
1482 /* Replace a policy range. */
1483 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1484                                  unsigned long end, struct sp_node *new)
1485 {
1486         struct sp_node *n, *new2 = NULL;
1487
1488 restart:
1489         spin_lock(&sp->lock);
1490         n = sp_lookup(sp, start, end);
1491         /* Take care of old policies in the same range. */
1492         while (n && n->start < end) {
1493                 struct rb_node *next = rb_next(&n->nd);
1494                 if (n->start >= start) {
1495                         if (n->end <= end)
1496                                 sp_delete(sp, n);
1497                         else
1498                                 n->start = end;
1499                 } else {
1500                         /* Old policy spanning whole new range. */
1501                         if (n->end > end) {
1502                                 if (!new2) {
1503                                         spin_unlock(&sp->lock);
1504                                         new2 = sp_alloc(end, n->end, n->policy);
1505                                         if (!new2)
1506                                                 return -ENOMEM;
1507                                         goto restart;
1508                                 }
1509                                 n->end = start;
1510                                 sp_insert(sp, new2);
1511                                 new2 = NULL;
1512                                 break;
1513                         } else
1514                                 n->end = start;
1515                 }
1516                 if (!next)
1517                         break;
1518                 n = rb_entry(next, struct sp_node, nd);
1519         }
1520         if (new)
1521                 sp_insert(sp, new);
1522         spin_unlock(&sp->lock);
1523         if (new2) {
1524                 mpol_free(new2->policy);
1525                 kmem_cache_free(sn_cache, new2);
1526         }
1527         return 0;
1528 }
1529
1530 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1531                                 nodemask_t *policy_nodes)
1532 {
1533         info->root = RB_ROOT;
1534         spin_lock_init(&info->lock);
1535
1536         if (policy != MPOL_DEFAULT) {
1537                 struct mempolicy *newpol;
1538
1539                 /* Falls back to MPOL_DEFAULT on any error */
1540                 newpol = mpol_new(policy, policy_nodes);
1541                 if (!IS_ERR(newpol)) {
1542                         /* Create pseudo-vma that contains just the policy */
1543                         struct vm_area_struct pvma;
1544
1545                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1546                         /* Policy covers entire file */
1547                         pvma.vm_end = TASK_SIZE;
1548                         mpol_set_shared_policy(info, &pvma, newpol);
1549                         mpol_free(newpol);
1550                 }
1551         }
1552 }
1553
1554 int mpol_set_shared_policy(struct shared_policy *info,
1555                         struct vm_area_struct *vma, struct mempolicy *npol)
1556 {
1557         int err;
1558         struct sp_node *new = NULL;
1559         unsigned long sz = vma_pages(vma);
1560
1561         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1562                  vma->vm_pgoff,
1563                  sz, npol? npol->policy : -1,
1564                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1565
1566         if (npol) {
1567                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1568                 if (!new)
1569                         return -ENOMEM;
1570         }
1571         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1572         if (err && new)
1573                 kmem_cache_free(sn_cache, new);
1574         return err;
1575 }
1576
1577 /* Free a backing policy store on inode delete. */
1578 void mpol_free_shared_policy(struct shared_policy *p)
1579 {
1580         struct sp_node *n;
1581         struct rb_node *next;
1582
1583         if (!p->root.rb_node)
1584                 return;
1585         spin_lock(&p->lock);
1586         next = rb_first(&p->root);
1587         while (next) {
1588                 n = rb_entry(next, struct sp_node, nd);
1589                 next = rb_next(&n->nd);
1590                 rb_erase(&n->nd, &p->root);
1591                 mpol_free(n->policy);
1592                 kmem_cache_free(sn_cache, n);
1593         }
1594         spin_unlock(&p->lock);
1595 }
1596
1597 /* assumes fs == KERNEL_DS */
1598 void __init numa_policy_init(void)
1599 {
1600         policy_cache = kmem_cache_create("numa_policy",
1601                                          sizeof(struct mempolicy),
1602                                          0, SLAB_PANIC, NULL, NULL);
1603
1604         sn_cache = kmem_cache_create("shared_policy_node",
1605                                      sizeof(struct sp_node),
1606                                      0, SLAB_PANIC, NULL, NULL);
1607
1608         /* Set interleaving policy for system init. This way not all
1609            the data structures allocated at system boot end up in node zero. */
1610
1611         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1612                 printk("numa_policy_init: interleaving failed\n");
1613 }
1614
1615 /* Reset policy of current process to default */
1616 void numa_default_policy(void)
1617 {
1618         do_set_mempolicy(MPOL_DEFAULT, NULL);
1619 }
1620
1621 /* Migrate a policy to a different set of nodes */
1622 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1623 {
1624         nodemask_t *mpolmask;
1625         nodemask_t tmp;
1626
1627         if (!pol)
1628                 return;
1629         mpolmask = &pol->cpuset_mems_allowed;
1630         if (nodes_equal(*mpolmask, *newmask))
1631                 return;
1632
1633         switch (pol->policy) {
1634         case MPOL_DEFAULT:
1635                 break;
1636         case MPOL_INTERLEAVE:
1637                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1638                 pol->v.nodes = tmp;
1639                 *mpolmask = *newmask;
1640                 current->il_next = node_remap(current->il_next,
1641                                                 *mpolmask, *newmask);
1642                 break;
1643         case MPOL_PREFERRED:
1644                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1645                                                 *mpolmask, *newmask);
1646                 *mpolmask = *newmask;
1647                 break;
1648         case MPOL_BIND: {
1649                 nodemask_t nodes;
1650                 struct zone **z;
1651                 struct zonelist *zonelist;
1652
1653                 nodes_clear(nodes);
1654                 for (z = pol->v.zonelist->zones; *z; z++)
1655                         node_set((*z)->zone_pgdat->node_id, nodes);
1656                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1657                 nodes = tmp;
1658
1659                 zonelist = bind_zonelist(&nodes);
1660
1661                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1662                  * If that old zonelist has no remaining mems_allowed nodes,
1663                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1664                  */
1665
1666                 if (zonelist) {
1667                         /* Good - got mem - substitute new zonelist */
1668                         kfree(pol->v.zonelist);
1669                         pol->v.zonelist = zonelist;
1670                 }
1671                 *mpolmask = *newmask;
1672                 break;
1673         }
1674         default:
1675                 BUG();
1676                 break;
1677         }
1678 }
1679
1680 /*
1681  * Wrapper for mpol_rebind_policy() that just requires task
1682  * pointer, and updates task mempolicy.
1683  */
1684
1685 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1686 {
1687         mpol_rebind_policy(tsk->mempolicy, new);
1688 }
1689
1690 /*
1691  * Rebind each vma in mm to new nodemask.
1692  *
1693  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1694  */
1695
1696 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1697 {
1698         struct vm_area_struct *vma;
1699
1700         down_write(&mm->mmap_sem);
1701         for (vma = mm->mmap; vma; vma = vma->vm_next)
1702                 mpol_rebind_policy(vma->vm_policy, new);
1703         up_write(&mm->mmap_sem);
1704 }
1705
1706 /*
1707  * Display pages allocated per node and memory policy via /proc.
1708  */
1709
1710 static const char *policy_types[] = { "default", "prefer", "bind",
1711                                       "interleave" };
1712
1713 /*
1714  * Convert a mempolicy into a string.
1715  * Returns the number of characters in buffer (if positive)
1716  * or an error (negative)
1717  */
1718 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1719 {
1720         char *p = buffer;
1721         int l;
1722         nodemask_t nodes;
1723         int mode = pol ? pol->policy : MPOL_DEFAULT;
1724
1725         switch (mode) {
1726         case MPOL_DEFAULT:
1727                 nodes_clear(nodes);
1728                 break;
1729
1730         case MPOL_PREFERRED:
1731                 nodes_clear(nodes);
1732                 node_set(pol->v.preferred_node, nodes);
1733                 break;
1734
1735         case MPOL_BIND:
1736                 get_zonemask(pol, &nodes);
1737                 break;
1738
1739         case MPOL_INTERLEAVE:
1740                 nodes = pol->v.nodes;
1741                 break;
1742
1743         default:
1744                 BUG();
1745                 return -EFAULT;
1746         }
1747
1748         l = strlen(policy_types[mode]);
1749         if (buffer + maxlen < p + l + 1)
1750                 return -ENOSPC;
1751
1752         strcpy(p, policy_types[mode]);
1753         p += l;
1754
1755         if (!nodes_empty(nodes)) {
1756                 if (buffer + maxlen < p + 2)
1757                         return -ENOSPC;
1758                 *p++ = '=';
1759                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1760         }
1761         return p - buffer;
1762 }
1763
1764 struct numa_maps {
1765         unsigned long pages;
1766         unsigned long anon;
1767         unsigned long active;
1768         unsigned long writeback;
1769         unsigned long mapcount_max;
1770         unsigned long dirty;
1771         unsigned long swapcache;
1772         unsigned long node[MAX_NUMNODES];
1773 };
1774
1775 static void gather_stats(struct page *page, void *private, int pte_dirty)
1776 {
1777         struct numa_maps *md = private;
1778         int count = page_mapcount(page);
1779
1780         md->pages++;
1781         if (pte_dirty || PageDirty(page))
1782                 md->dirty++;
1783
1784         if (PageSwapCache(page))
1785                 md->swapcache++;
1786
1787         if (PageActive(page))
1788                 md->active++;
1789
1790         if (PageWriteback(page))
1791                 md->writeback++;
1792
1793         if (PageAnon(page))
1794                 md->anon++;
1795
1796         if (count > md->mapcount_max)
1797                 md->mapcount_max = count;
1798
1799         md->node[page_to_nid(page)]++;
1800 }
1801
1802 #ifdef CONFIG_HUGETLB_PAGE
1803 static void check_huge_range(struct vm_area_struct *vma,
1804                 unsigned long start, unsigned long end,
1805                 struct numa_maps *md)
1806 {
1807         unsigned long addr;
1808         struct page *page;
1809
1810         for (addr = start; addr < end; addr += HPAGE_SIZE) {
1811                 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1812                 pte_t pte;
1813
1814                 if (!ptep)
1815                         continue;
1816
1817                 pte = *ptep;
1818                 if (pte_none(pte))
1819                         continue;
1820
1821                 page = pte_page(pte);
1822                 if (!page)
1823                         continue;
1824
1825                 gather_stats(page, md, pte_dirty(*ptep));
1826         }
1827 }
1828 #else
1829 static inline void check_huge_range(struct vm_area_struct *vma,
1830                 unsigned long start, unsigned long end,
1831                 struct numa_maps *md)
1832 {
1833 }
1834 #endif
1835
1836 int show_numa_map(struct seq_file *m, void *v)
1837 {
1838         struct task_struct *task = m->private;
1839         struct vm_area_struct *vma = v;
1840         struct numa_maps *md;
1841         struct file *file = vma->vm_file;
1842         struct mm_struct *mm = vma->vm_mm;
1843         int n;
1844         char buffer[50];
1845
1846         if (!mm)
1847                 return 0;
1848
1849         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1850         if (!md)
1851                 return 0;
1852
1853         mpol_to_str(buffer, sizeof(buffer),
1854                         get_vma_policy(task, vma, vma->vm_start));
1855
1856         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1857
1858         if (file) {
1859                 seq_printf(m, " file=");
1860                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1861         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1862                 seq_printf(m, " heap");
1863         } else if (vma->vm_start <= mm->start_stack &&
1864                         vma->vm_end >= mm->start_stack) {
1865                 seq_printf(m, " stack");
1866         }
1867
1868         if (is_vm_hugetlb_page(vma)) {
1869                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1870                 seq_printf(m, " huge");
1871         } else {
1872                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1873                                 &node_online_map, MPOL_MF_STATS, md);
1874         }
1875
1876         if (!md->pages)
1877                 goto out;
1878
1879         if (md->anon)
1880                 seq_printf(m," anon=%lu",md->anon);
1881
1882         if (md->dirty)
1883                 seq_printf(m," dirty=%lu",md->dirty);
1884
1885         if (md->pages != md->anon && md->pages != md->dirty)
1886                 seq_printf(m, " mapped=%lu", md->pages);
1887
1888         if (md->mapcount_max > 1)
1889                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1890
1891         if (md->swapcache)
1892                 seq_printf(m," swapcache=%lu", md->swapcache);
1893
1894         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1895                 seq_printf(m," active=%lu", md->active);
1896
1897         if (md->writeback)
1898                 seq_printf(m," writeback=%lu", md->writeback);
1899
1900         for_each_online_node(n)
1901                 if (md->node[n])
1902                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1903 out:
1904         seq_putc(m, '\n');
1905         kfree(md);
1906
1907         if (m->count < m->size)
1908                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1909         return 0;
1910 }
1911