mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Locking:
  22  * - the page->mapcount field is protected by the PG_maplock bit,
  23  *   which nests within the mm->page_table_lock,
  24  *   which nests within the page lock.
  25  * - because swapout locking is opposite to the locking order
  26  *   in the page fault path, the swapout path uses trylocks
  27  *   on the mm->page_table_lock
  28  */
  29 #include <linux/mm.h>
  30 #include <linux/pagemap.h>
  31 #include <linux/swap.h>
  32 #include <linux/swapops.h>
  33 #include <linux/slab.h>
  34 #include <linux/init.h>
  35 #include <linux/rmap.h>
  36 #include <linux/vs_memory.h>
  37
  38 #include <asm/tlbflush.h>
  39
//#define RMAP_DEBUG /* can be enabled only for debugging */

/*
 * Slab cache for struct anon_vma objects; created by anon_vma_init()
 * with anon_vma_ctor() as its constructor.
 */
kmem_cache_t *anon_vma_cachep;
  43
  44 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  45 {
  46 #ifdef RMAP_DEBUG
  47         struct anon_vma *anon_vma = find_vma->anon_vma;
  48         struct vm_area_struct *vma;
  49         unsigned int mapcount = 0;
  50         int found = 0;
  51
  52         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  53                 mapcount++;
  54                 BUG_ON(mapcount > 100000);
  55                 if (vma == find_vma)
  56                         found = 1;
  57         }
  58         BUG_ON(!found);
  59 #endif
  60 }
  61
  62 /* This must be called under the mmap_sem. */
  63 int anon_vma_prepare(struct vm_area_struct *vma)
  64 {
  65         struct anon_vma *anon_vma = vma->anon_vma;
  66
  67         might_sleep();
  68         if (unlikely(!anon_vma)) {
  69                 struct mm_struct *mm = vma->vm_mm;
  70                 struct anon_vma *allocated = NULL;
  71
  72                 anon_vma = find_mergeable_anon_vma(vma);
  73                 if (!anon_vma) {
  74                         anon_vma = anon_vma_alloc();
  75                         if (unlikely(!anon_vma))
  76                                 return -ENOMEM;
  77                         allocated = anon_vma;
  78                 }
  79
  80                 /* page_table_lock to protect against threads */
  81                 spin_lock(&mm->page_table_lock);
  82                 if (likely(!vma->anon_vma)) {
  83                         vma->anon_vma = anon_vma;
  84                         list_add(&vma->anon_vma_node, &anon_vma->head);
  85                         allocated = NULL;
  86                 }
  87                 spin_unlock(&mm->page_table_lock);
  88                 if (unlikely(allocated))
  89                         anon_vma_free(allocated);
  90         }
  91         return 0;
  92 }
  93
  94 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  95 {
  96         if (!vma->anon_vma) {
  97                 BUG_ON(!next->anon_vma);
  98                 vma->anon_vma = next->anon_vma;
  99                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 100         } else {
 101                 /* if they're both non-null they must be the same */
 102                 BUG_ON(vma->anon_vma != next->anon_vma);
 103         }
 104         list_del(&next->anon_vma_node);
 105 }
 106
 107 void __anon_vma_link(struct vm_area_struct *vma)
 108 {
 109         struct anon_vma *anon_vma = vma->anon_vma;
 110
 111         if (anon_vma) {
 112                 list_add(&vma->anon_vma_node, &anon_vma->head);
 113                 validate_anon_vma(vma);
 114         }
 115 }
 116
 117 void anon_vma_link(struct vm_area_struct *vma)
 118 {
 119         struct anon_vma *anon_vma = vma->anon_vma;
 120
 121         if (anon_vma) {
 122                 spin_lock(&anon_vma->lock);
 123                 list_add(&vma->anon_vma_node, &anon_vma->head);
 124                 validate_anon_vma(vma);
 125                 spin_unlock(&anon_vma->lock);
 126         }
 127 }
 128
 129 void anon_vma_unlink(struct vm_area_struct *vma)
 130 {
 131         struct anon_vma *anon_vma = vma->anon_vma;
 132         int empty;
 133
 134         if (!anon_vma)
 135                 return;
 136
 137         spin_lock(&anon_vma->lock);
 138         validate_anon_vma(vma);
 139         list_del(&vma->anon_vma_node);
 140
 141         /* We must garbage collect the anon_vma if it's empty */
 142         empty = list_empty(&anon_vma->head);
 143         spin_unlock(&anon_vma->lock);
 144
 145         if (empty)
 146                 anon_vma_free(anon_vma);
 147 }
 148
 149 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 150 {
 151         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 152                                                 SLAB_CTOR_CONSTRUCTOR) {
 153                 struct anon_vma *anon_vma = data;
 154
 155                 spin_lock_init(&anon_vma->lock);
 156                 INIT_LIST_HEAD(&anon_vma->head);
 157         }
 158 }
 159
 160 void __init anon_vma_init(void)
 161 {
 162         anon_vma_cachep = kmem_cache_create("anon_vma",
 163                 sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 164 }
 165
 166 /* this needs the page->flags PG_maplock held */
 167 static inline void clear_page_anon(struct page *page)
 168 {
 169         BUG_ON(!page->mapping);
 170         page->mapping = NULL;
 171         ClearPageAnon(page);
 172 }
 173
 174 /*
 175  * At what user virtual address is page expected in vma?
 176  */
 177 static inline unsigned long
 178 vma_address(struct page *page, struct vm_area_struct *vma)
 179 {
 180         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 181         unsigned long address;
 182
 183         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 184         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 185                 /* page should be within any vma from prio_tree_next */
 186                 BUG_ON(!PageAnon(page));
 187                 return -EFAULT;
 188         }
 189         return address;
 190 }
 191
 192 /*
 193  * Subfunctions of page_referenced: page_referenced_one called
 194  * repeatedly from either page_referenced_anon or page_referenced_file.
 195  */
 196 static int page_referenced_one(struct page *page,
 197         struct vm_area_struct *vma, unsigned int *mapcount)
 198 {
 199         struct mm_struct *mm = vma->vm_mm;
 200         unsigned long address;
 201         pgd_t *pgd;
 202         pmd_t *pmd;
 203         pte_t *pte;
 204         int referenced = 0;
 205
 206         if (!mm->rss)
 207                 goto out;
 208         address = vma_address(page, vma);
 209         if (address == -EFAULT)
 210                 goto out;
 211
 212         if (!spin_trylock(&mm->page_table_lock))
 213                 goto out;
 214
 215         pgd = pgd_offset(mm, address);
 216         if (!pgd_present(*pgd))
 217                 goto out_unlock;
 218
 219         pmd = pmd_offset(pgd, address);
 220         if (!pmd_present(*pmd))
 221                 goto out_unlock;
 222
 223         pte = pte_offset_map(pmd, address);
 224         if (!pte_present(*pte))
 225                 goto out_unmap;
 226
 227         if (page_to_pfn(page) != pte_pfn(*pte))
 228                 goto out_unmap;
 229
 230         if (ptep_test_and_clear_young(pte))
 231                 referenced++;
 232
 233         (*mapcount)--;
 234
 235 out_unmap:
 236         pte_unmap(pte);
 237 out_unlock:
 238         spin_unlock(&mm->page_table_lock);
 239 out:
 240         return referenced;
 241 }
 242
 243 static inline int page_referenced_anon(struct page *page)
 244 {
 245         unsigned int mapcount = page->mapcount;
 246         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 247         struct vm_area_struct *vma;
 248         int referenced = 0;
 249
 250         spin_lock(&anon_vma->lock);
 251         BUG_ON(list_empty(&anon_vma->head));
 252         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 253                 referenced += page_referenced_one(page, vma, &mapcount);
 254                 if (!mapcount)
 255                         break;
 256         }
 257         spin_unlock(&anon_vma->lock);
 258         return referenced;
 259 }
 260
 261 /**
 262  * page_referenced_file - referenced check for object-based rmap
 263  * @page: the page we're checking references on.
 264  *
 265  * For an object-based mapped page, find all the places it is mapped and
 266  * check/clear the referenced flag.  This is done by following the page->mapping
 267  * pointer, then walking the chain of vmas it holds.  It returns the number
 268  * of references it found.
 269  *
 270  * This function is only called from page_referenced for object-based pages.
 271  *
 272  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 273  * assume a reference count of 0, so try_to_unmap will then have a go.
 274  */
 275 static inline int page_referenced_file(struct page *page)
 276 {
 277         unsigned int mapcount = page->mapcount;
 278         struct address_space *mapping = page->mapping;
 279         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 280         struct vm_area_struct *vma = NULL;
 281         struct prio_tree_iter iter;
 282         int referenced = 0;
 283
 284         if (!spin_trylock(&mapping->i_mmap_lock))
 285                 return 0;
 286
 287         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 288                                         &iter, pgoff, pgoff)) != NULL) {
 289                 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 290                                   == (VM_LOCKED|VM_MAYSHARE)) {
 291                         referenced++;
 292                         break;
 293                 }
 294                 referenced += page_referenced_one(page, vma, &mapcount);
 295                 if (!mapcount)
 296                         break;
 297         }
 298
 299         spin_unlock(&mapping->i_mmap_lock);
 300         return referenced;
 301 }
 302
 303 /**
 304  * page_referenced - test if the page was referenced
 305  * @page: the page to test
 306  *
 307  * Quick test_and_clear_referenced for all mappings to a page,
 308  * returns the number of ptes which referenced the page.
 309  * Caller needs to hold the rmap lock.
 310  */
 311 int page_referenced(struct page *page)
 312 {
 313         int referenced = 0;
 314
 315         if (page_test_and_clear_young(page))
 316                 referenced++;
 317
 318         if (TestClearPageReferenced(page))
 319                 referenced++;
 320
 321         if (page->mapcount && page->mapping) {
 322                 if (PageAnon(page))
 323                         referenced += page_referenced_anon(page);
 324                 else
 325                         referenced += page_referenced_file(page);
 326         }
 327         return referenced;
 328 }
 329
 330 /**
 331  * page_add_anon_rmap - add pte mapping to an anonymous page
 332  * @page:       the page to add the mapping to
 333  * @vma:        the vm area in which the mapping is added
 334  * @address:    the user virtual address mapped
 335  *
 336  * The caller needs to hold the mm->page_table_lock.
 337  */
 338 void page_add_anon_rmap(struct page *page,
 339         struct vm_area_struct *vma, unsigned long address)
 340 {
 341         struct anon_vma *anon_vma = vma->anon_vma;
 342         pgoff_t index;
 343
 344         BUG_ON(PageReserved(page));
 345         BUG_ON(!anon_vma);
 346
 347         index = (address - vma->vm_start) >> PAGE_SHIFT;
 348         index += vma->vm_pgoff;
 349         index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 350
 351         /*
 352          * Setting and clearing PG_anon must always happen inside
 353          * page_map_lock to avoid races between mapping and
 354          * unmapping on different processes of the same
 355          * shared cow swapcache page. And while we take the
 356          * page_map_lock PG_anon cannot change from under us.
 357          * Actually PG_anon cannot change under fork either
 358          * since fork holds a reference on the page so it cannot
 359          * be unmapped under fork and in turn copy_page_range is
 360          * allowed to read PG_anon outside the page_map_lock.
 361          */
 362         page_map_lock(page);
 363         if (!page->mapcount) {
 364                 BUG_ON(PageAnon(page));
 365                 BUG_ON(page->mapping);
 366                 SetPageAnon(page);
 367                 page->index = index;
 368                 page->mapping = (struct address_space *) anon_vma;
 369                 inc_page_state(nr_mapped);
 370         } else {
 371                 BUG_ON(!PageAnon(page));
 372                 BUG_ON(page->index != index);
 373                 BUG_ON(page->mapping != (struct address_space *) anon_vma);
 374         }
 375         page->mapcount++;
 376         page_map_unlock(page);
 377 }
 378
 379 /**
 380  * page_add_file_rmap - add pte mapping to a file page
 381  * @page: the page to add the mapping to
 382  *
 383  * The caller needs to hold the mm->page_table_lock.
 384  */
 385 void page_add_file_rmap(struct page *page)
 386 {
 387         BUG_ON(PageAnon(page));
 388         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 389                 return;
 390
 391         page_map_lock(page);
 392         if (!page->mapcount)
 393                 inc_page_state(nr_mapped);
 394         page->mapcount++;
 395         page_map_unlock(page);
 396 }
 397
 398 /**
 399  * page_remove_rmap - take down pte mapping from a page
 400  * @page: page to remove mapping from
 401  *
 402  * Caller needs to hold the mm->page_table_lock.
 403  */
 404 void page_remove_rmap(struct page *page)
 405 {
 406         BUG_ON(PageReserved(page));
 407         BUG_ON(!page->mapcount);
 408
 409         page_map_lock(page);
 410         page->mapcount--;
 411         if (!page->mapcount) {
 412                 if (page_test_and_clear_dirty(page))
 413                         set_page_dirty(page);
 414                 if (PageAnon(page))
 415                         clear_page_anon(page);
 416                 dec_page_state(nr_mapped);
 417         }
 418         page_map_unlock(page);
 419 }
 420
/*
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 *
 * Tries to remove the pte which maps @page in @vma.
 *
 * Returns SWAP_FAIL if the vma is VM_LOCKED or VM_RESERVED, if the pte
 * was recently referenced, or if a swapcache page has an unexpected
 * extra reference.  Returns SWAP_AGAIN in every other case: the pte was
 * removed, no matching pte was found, or the page_table_lock trylock
 * failed.  Callers tell success apart by checking page->mapcount.
 */
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        pte_t pteval;
        int ret = SWAP_AGAIN;

        /* an mm with no resident pages cannot be mapping this one */
        if (!mm->rss)
                goto out;
        address = vma_address(page, vma);
        if (address == -EFAULT)
                goto out;

        /*
         * We need the page_table_lock to protect us from page faults,
         * munmap, fork, etc...
         */
        if (!spin_trylock(&mm->page_table_lock))
                goto out;

        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out_unlock;

        pmd = pmd_offset(pgd, address);
        if (!pmd_present(*pmd))
                goto out_unlock;

        pte = pte_offset_map(pmd, address);
        if (!pte_present(*pte))
                goto out_unmap;

        /* the pte at this address maps some other page */
        if (page_to_pfn(page) != pte_pfn(*pte))
                goto out_unmap;

        /*
         * If the page is mlock()d, we cannot swap it out.
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
        if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
                        ptep_test_and_clear_young(pte)) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

        /*
         * Don't pull an anonymous page out from under get_user_pages.
         * GUP carefully breaks COW and raises page count (while holding
         * page_table_lock, as we have here) to make sure that the page
         * cannot be freed.  If we unmap that page here, a user write
         * access to the virtual address will bring back the page, but
         * its raised count will (ironically) be taken to mean it's not
         * an exclusive swap page, do_wp_page will replace it by a copy
         * page, and the user never get to see the data GUP was holding
         * the original page for.
         */
        if (PageSwapCache(page) &&
            page_count(page) != page->mapcount + 2) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

        /* Nuke the page table entry. */
        flush_cache_page(vma, address);
        pteval = ptep_clear_flush(vma, address, pte);

        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
                set_page_dirty(page);

        if (PageAnon(page)) {
                swp_entry_t entry = { .val = page->private };
                /*
                 * Store the swap location in the pte.
                 * See handle_pte_fault() ...
                 */
                BUG_ON(!PageSwapCache(page));
                swap_duplicate(entry);
                set_pte(pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
        }

        // mm->rss--;
        /* rss accounting goes through the vserver helper instead */
        vx_rsspages_dec(mm);
        BUG_ON(!page->mapcount);
        page->mapcount--;
        /* drop the page reference that was held for the removed pte */
        page_cache_release(page);

out_unmap:
        pte_unmap(pte);
out_unlock:
        spin_unlock(&mm->page_table_lock);
out:
        return ret;
}
 524
/*
 * objrmap doesn't work for nonlinear VMAs because the assumption that
 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 * Consequently, given a particular page and its ->index, we cannot locate the
 * ptes which are mapping that page without an exhaustive linear search.
 *
 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 * maps the file to which the target page belongs.  The ->vm_private_data field
 * holds the current cursor into that scan.  Successive searches will circulate
 * around the vma's virtual address space.
 *
 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 * more scanning pressure is placed against them as well.   Eventually pages
 * will become fully unmapped and are eligible for eviction.
 *
 * For very sparsely populated VMAs this is a little inefficient - chances are
 * that there won't be many ptes located within the scan cluster.  In this case
 * maybe we could scan further - to the end of the pte page, perhaps.
 *
 * CLUSTER_SIZE is the number of bytes of virtual address space scanned per
 * try_to_unmap_cluster() call: 32 pages, capped at PMD_SIZE.
 * CLUSTER_MASK rounds an address down to a CLUSTER_SIZE boundary.
 */
#define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
#define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 546
 547 static int try_to_unmap_cluster(unsigned long cursor,
 548         unsigned int *mapcount, struct vm_area_struct *vma)
 549 {
 550         struct mm_struct *mm = vma->vm_mm;
 551         pgd_t *pgd;
 552         pmd_t *pmd;
 553         pte_t *pte;
 554         pte_t pteval;
 555         struct page *page;
 556         unsigned long address;
 557         unsigned long end;
 558         unsigned long pfn;
 559
 560         /*
 561          * We need the page_table_lock to protect us from page faults,
 562          * munmap, fork, etc...
 563          */
 564         if (!spin_trylock(&mm->page_table_lock))
 565                 return SWAP_FAIL;
 566
 567         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 568         end = address + CLUSTER_SIZE;
 569         if (address < vma->vm_start)
 570                 address = vma->vm_start;
 571         if (end > vma->vm_end)
 572                 end = vma->vm_end;
 573
 574         pgd = pgd_offset(mm, address);
 575         if (!pgd_present(*pgd))
 576                 goto out_unlock;
 577
 578         pmd = pmd_offset(pgd, address);
 579         if (!pmd_present(*pmd))
 580                 goto out_unlock;
 581
 582         for (pte = pte_offset_map(pmd, address);
 583                         address < end; pte++, address += PAGE_SIZE) {
 584
 585                 if (!pte_present(*pte))
 586                         continue;
 587
 588                 pfn = pte_pfn(*pte);
 589                 if (!pfn_valid(pfn))
 590                         continue;
 591
 592                 page = pfn_to_page(pfn);
 593                 BUG_ON(PageAnon(page));
 594                 if (PageReserved(page))
 595                         continue;
 596
 597                 if (ptep_test_and_clear_young(pte))
 598                         continue;
 599
 600                 /* Nuke the page table entry. */
 601                 flush_cache_page(vma, address);
 602                 pteval = ptep_clear_flush(vma, address, pte);
 603
 604                 /* If nonlinear, store the file page offset in the pte. */
 605                 if (page->index != linear_page_index(vma, address))
 606                         set_pte(pte, pgoff_to_pte(page->index));
 607
 608                 /* Move the dirty bit to the physical page now the pte is gone. */
 609                 if (pte_dirty(pteval))
 610                         set_page_dirty(page);
 611
 612                 page_remove_rmap(page);
 613                 page_cache_release(page);
 614                 // mm->rss--;
 615                 vx_rsspages_dec(mm);
 616                 (*mapcount)--;
 617         }
 618
 619         pte_unmap(pte);
 620
 621 out_unlock:
 622         spin_unlock(&mm->page_table_lock);
 623         return SWAP_AGAIN;
 624 }
 625
 626 static inline int try_to_unmap_anon(struct page *page)
 627 {
 628         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 629         struct vm_area_struct *vma;
 630         int ret = SWAP_AGAIN;
 631
 632         spin_lock(&anon_vma->lock);
 633         BUG_ON(list_empty(&anon_vma->head));
 634         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 635                 ret = try_to_unmap_one(page, vma);
 636                 if (ret == SWAP_FAIL || !page->mapcount)
 637                         break;
 638         }
 639         spin_unlock(&anon_vma->lock);
 640         return ret;
 641 }
 642
/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * This function is only called from try_to_unmap for object-based pages.
 *
 * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 * return a temporary error.
 *
 * Returns SWAP_FAIL if try_to_unmap_one() reported it for one of the linear
 * vmas, SWAP_AGAIN otherwise.
 */
static inline int try_to_unmap_file(struct page *page)
{
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct vm_area_struct *vma = NULL;
        struct prio_tree_iter iter;
        int ret = SWAP_AGAIN;
        unsigned long cursor;
        unsigned long max_nl_cursor = 0;
        unsigned long max_nl_size = 0;
        unsigned int mapcount;

        if (!spin_trylock(&mapping->i_mmap_lock))
                return ret;

        /* First pass: the linear vmas covering this page's file offset. */
        while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
                                        &iter, pgoff, pgoff)) != NULL) {
                ret = try_to_unmap_one(page, vma);
                if (ret == SWAP_FAIL || !page->mapcount)
                        goto out;
        }

        if (list_empty(&mapping->i_mmap_nonlinear))
                goto out;

        /*
         * Survey the unlocked, unreserved nonlinear vmas: find the
         * furthest scan cursor and the largest vma size among them.
         */
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
                        max_nl_cursor = cursor;
                /* cursor is reused here to hold the vma's size */
                cursor = vma->vm_end - vma->vm_start;
                if (cursor > max_nl_size)
                        max_nl_size = cursor;
        }

        if (max_nl_size == 0)   /* any nonlinears locked or reserved */
                goto out;

        /*
         * We don't try to search for this page in the nonlinear vmas,
         * and page_referenced wouldn't have found it anyway.  Instead
         * just walk the nonlinear vmas trying to age and unmap some.
         * The mapcount of the page we came in with is irrelevant,
         * but even so use it as a guide to how hard we should try?
         */
        mapcount = page->mapcount;
        /*
         * The page map lock is dropped for the cluster scan and retaken
         * at relock below.
         */
        page_map_unlock(page);
        cond_resched_lock(&mapping->i_mmap_lock);

        max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
        if (max_nl_cursor == 0)
                max_nl_cursor = CLUSTER_SIZE;

        /*
         * Advance every vma's cursor up to max_nl_cursor, one cluster at
         * a time, then raise max_nl_cursor by a cluster and repeat until
         * the largest vma has been covered.
         */
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                        if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
                        while (vma->vm_mm->rss &&
                                cursor < max_nl_cursor &&
                                cursor < vma->vm_end - vma->vm_start) {
                                ret = try_to_unmap_cluster(
                                                cursor, &mapcount, vma);
                                if (ret == SWAP_FAIL)
                                        break;
                                cursor += CLUSTER_SIZE;
                                vma->vm_private_data = (void *) cursor;
                                /* enough ptes removed: stop scanning */
                                if ((int)mapcount <= 0)
                                        goto relock;
                        }
                        /*
                         * Unless the cluster scan failed, mark this vma
                         * as caught up with the current pass.
                         */
                        if (ret != SWAP_FAIL)
                                vma->vm_private_data =
                                        (void *) max_nl_cursor;
                        /* a failure in one vma is not reported upwards */
                        ret = SWAP_AGAIN;
                }
                cond_resched_lock(&mapping->i_mmap_lock);
                max_nl_cursor += CLUSTER_SIZE;
        } while (max_nl_cursor <= max_nl_size);

        /*
         * Don't loop forever (perhaps all the remaining pages are
         * in locked vmas).  Reset cursor on all unreserved nonlinear
         * vmas, now forgetting on which ones it had fallen behind.
         */
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
                if (!(vma->vm_flags & VM_RESERVED))
                        vma->vm_private_data = 0;
        }
relock:
        page_map_lock(page);
out:
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
}
 753
 754 /**
 755  * try_to_unmap - try to remove all page table mappings to a page
 756  * @page: the page to get unmapped
 757  *
 758  * Tries to remove all the page table entries which are mapping this
 759  * page, used in the pageout path.  Caller must hold the page lock
 760  * and its rmap lock.  Return values are:
 761  *
 762  * SWAP_SUCCESS - we succeeded in removing all mappings
 763  * SWAP_AGAIN   - we missed a trylock, try again later
 764  * SWAP_FAIL    - the page is unswappable
 765  */
 766 int try_to_unmap(struct page *page)
 767 {
 768         int ret;
 769
 770         BUG_ON(PageReserved(page));
 771         BUG_ON(!PageLocked(page));
 772         BUG_ON(!page->mapcount);
 773
 774         if (PageAnon(page))
 775                 ret = try_to_unmap_anon(page);
 776         else
 777                 ret = try_to_unmap_file(page);
 778
 779         if (!page->mapcount) {
 780                 if (page_test_and_clear_dirty(page))
 781                         set_page_dirty(page);
 782                 if (PageAnon(page))
 783                         clear_page_anon(page);
 784                 dec_page_state(nr_mapped);
 785                 ret = SWAP_SUCCESS;
 786         }
 787         return ret;
 788 }