mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Locking: see "Lock ordering" summary in filemap.c.
  22  * In swapout, page_map_lock is held on entry to page_referenced and
  23  * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
  24  */
  25
  26 #include <linux/mm.h>
  27 #include <linux/pagemap.h>
  28 #include <linux/swap.h>
  29 #include <linux/swapops.h>
  30 #include <linux/slab.h>
  31 #include <linux/init.h>
  32 #include <linux/rmap.h>
  33 #include <linux/vs_memory.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 //#define RMAP_DEBUG /* can be enabled only for debugging */
  38
  39 kmem_cache_t *anon_vma_cachep;
  40
  41 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  42 {
  43 #ifdef RMAP_DEBUG
  44         struct anon_vma *anon_vma = find_vma->anon_vma;
  45         struct vm_area_struct *vma;
  46         unsigned int mapcount = 0;
  47         int found = 0;
  48
  49         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  50                 mapcount++;
  51                 BUG_ON(mapcount > 100000);
  52                 if (vma == find_vma)
  53                         found = 1;
  54         }
  55         BUG_ON(!found);
  56 #endif
  57 }
  58
  59 /* This must be called under the mmap_sem. */
  60 int anon_vma_prepare(struct vm_area_struct *vma)
  61 {
  62         struct anon_vma *anon_vma = vma->anon_vma;
  63
  64         might_sleep();
  65         if (unlikely(!anon_vma)) {
  66                 struct mm_struct *mm = vma->vm_mm;
  67                 struct anon_vma *allocated = NULL;
  68
  69                 anon_vma = find_mergeable_anon_vma(vma);
  70                 if (!anon_vma) {
  71                         anon_vma = anon_vma_alloc();
  72                         if (unlikely(!anon_vma))
  73                                 return -ENOMEM;
  74                         allocated = anon_vma;
  75                 }
  76
  77                 /* page_table_lock to protect against threads */
  78                 spin_lock(&mm->page_table_lock);
  79                 if (likely(!vma->anon_vma)) {
  80                         if (!allocated)
  81                                 spin_lock(&anon_vma->lock);
  82                         vma->anon_vma = anon_vma;
  83                         list_add(&vma->anon_vma_node, &anon_vma->head);
  84                         if (!allocated)
  85                                 spin_unlock(&anon_vma->lock);
  86                         allocated = NULL;
  87                 }
  88                 spin_unlock(&mm->page_table_lock);
  89                 if (unlikely(allocated))
  90                         anon_vma_free(allocated);
  91         }
  92         return 0;
  93 }
  94
  95 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  96 {
  97         if (!vma->anon_vma) {
  98                 BUG_ON(!next->anon_vma);
  99                 vma->anon_vma = next->anon_vma;
 100                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 101         } else {
 102                 /* if they're both non-null they must be the same */
 103                 BUG_ON(vma->anon_vma != next->anon_vma);
 104         }
 105         list_del(&next->anon_vma_node);
 106 }
 107
 108 void __anon_vma_link(struct vm_area_struct *vma)
 109 {
 110         struct anon_vma *anon_vma = vma->anon_vma;
 111
 112         if (anon_vma) {
 113                 list_add(&vma->anon_vma_node, &anon_vma->head);
 114                 validate_anon_vma(vma);
 115         }
 116 }
 117
 118 void anon_vma_link(struct vm_area_struct *vma)
 119 {
 120         struct anon_vma *anon_vma = vma->anon_vma;
 121
 122         if (anon_vma) {
 123                 spin_lock(&anon_vma->lock);
 124                 list_add(&vma->anon_vma_node, &anon_vma->head);
 125                 validate_anon_vma(vma);
 126                 spin_unlock(&anon_vma->lock);
 127         }
 128 }
 129
 130 void anon_vma_unlink(struct vm_area_struct *vma)
 131 {
 132         struct anon_vma *anon_vma = vma->anon_vma;
 133         int empty;
 134
 135         if (!anon_vma)
 136                 return;
 137
 138         spin_lock(&anon_vma->lock);
 139         validate_anon_vma(vma);
 140         list_del(&vma->anon_vma_node);
 141
 142         /* We must garbage collect the anon_vma if it's empty */
 143         empty = list_empty(&anon_vma->head);
 144         spin_unlock(&anon_vma->lock);
 145
 146         if (empty)
 147                 anon_vma_free(anon_vma);
 148 }
 149
 150 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 151 {
 152         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 153                                                 SLAB_CTOR_CONSTRUCTOR) {
 154                 struct anon_vma *anon_vma = data;
 155
 156                 spin_lock_init(&anon_vma->lock);
 157                 INIT_LIST_HEAD(&anon_vma->head);
 158         }
 159 }
 160
 161 void __init anon_vma_init(void)
 162 {
 163         anon_vma_cachep = kmem_cache_create("anon_vma",
 164                 sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 165 }
 166
 167 /* this needs the page->flags PG_maplock held */
 168 static inline void clear_page_anon(struct page *page)
 169 {
 170         BUG_ON(!page->mapping);
 171         page->mapping = NULL;
 172         ClearPageAnon(page);
 173 }
 174
 175 /*
 176  * At what user virtual address is page expected in vma?
 177  */
 178 static inline unsigned long
 179 vma_address(struct page *page, struct vm_area_struct *vma)
 180 {
 181         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 182         unsigned long address;
 183
 184         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 185         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 186                 /* page should be within any vma from prio_tree_next */
 187                 BUG_ON(!PageAnon(page));
 188                 return -EFAULT;
 189         }
 190         return address;
 191 }
 192
 193 /*
 194  * Subfunctions of page_referenced: page_referenced_one called
 195  * repeatedly from either page_referenced_anon or page_referenced_file.
 196  */
 197 static int page_referenced_one(struct page *page,
 198         struct vm_area_struct *vma, unsigned int *mapcount)
 199 {
 200         struct mm_struct *mm = vma->vm_mm;
 201         unsigned long address;
 202         pgd_t *pgd;
 203         pmd_t *pmd;
 204         pte_t *pte;
 205         int referenced = 0;
 206
 207         if (!mm->rss)
 208                 goto out;
 209         address = vma_address(page, vma);
 210         if (address == -EFAULT)
 211                 goto out;
 212
 213         if (!spin_trylock(&mm->page_table_lock))
 214                 goto out;
 215
 216         pgd = pgd_offset(mm, address);
 217         if (!pgd_present(*pgd))
 218                 goto out_unlock;
 219
 220         pmd = pmd_offset(pgd, address);
 221         if (!pmd_present(*pmd))
 222                 goto out_unlock;
 223
 224         pte = pte_offset_map(pmd, address);
 225         if (!pte_present(*pte))
 226                 goto out_unmap;
 227
 228         if (page_to_pfn(page) != pte_pfn(*pte))
 229                 goto out_unmap;
 230
 231         if (ptep_clear_flush_young(vma, address, pte))
 232                 referenced++;
 233
 234         (*mapcount)--;
 235
 236 out_unmap:
 237         pte_unmap(pte);
 238 out_unlock:
 239         spin_unlock(&mm->page_table_lock);
 240 out:
 241         return referenced;
 242 }
 243
 244 static inline int page_referenced_anon(struct page *page)
 245 {
 246         unsigned int mapcount = page->mapcount;
 247         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 248         struct vm_area_struct *vma;
 249         int referenced = 0;
 250
 251         spin_lock(&anon_vma->lock);
 252         BUG_ON(list_empty(&anon_vma->head));
 253         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 254                 referenced += page_referenced_one(page, vma, &mapcount);
 255                 if (!mapcount)
 256                         break;
 257         }
 258         spin_unlock(&anon_vma->lock);
 259         return referenced;
 260 }
 261
 262 /**
 263  * page_referenced_file - referenced check for object-based rmap
 264  * @page: the page we're checking references on.
 265  *
 266  * For an object-based mapped page, find all the places it is mapped and
 267  * check/clear the referenced flag.  This is done by following the page->mapping
 268  * pointer, then walking the chain of vmas it holds.  It returns the number
 269  * of references it found.
 270  *
 271  * This function is only called from page_referenced for object-based pages.
 272  *
 273  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 274  * assume a reference count of 0, so try_to_unmap will then have a go.
 275  */
 276 static inline int page_referenced_file(struct page *page)
 277 {
 278         unsigned int mapcount = page->mapcount;
 279         struct address_space *mapping = page->mapping;
 280         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 281         struct vm_area_struct *vma = NULL;
 282         struct prio_tree_iter iter;
 283         int referenced = 0;
 284
 285         if (!spin_trylock(&mapping->i_mmap_lock))
 286                 return 0;
 287
 288         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 289                                         &iter, pgoff, pgoff)) != NULL) {
 290                 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 291                                   == (VM_LOCKED|VM_MAYSHARE)) {
 292                         referenced++;
 293                         break;
 294                 }
 295                 referenced += page_referenced_one(page, vma, &mapcount);
 296                 if (!mapcount)
 297                         break;
 298         }
 299
 300         spin_unlock(&mapping->i_mmap_lock);
 301         return referenced;
 302 }
 303
 304 /**
 305  * page_referenced - test if the page was referenced
 306  * @page: the page to test
 307  *
 308  * Quick test_and_clear_referenced for all mappings to a page,
 309  * returns the number of ptes which referenced the page.
 310  * Caller needs to hold the rmap lock.
 311  */
 312 int page_referenced(struct page *page)
 313 {
 314         int referenced = 0;
 315
 316         if (page_test_and_clear_young(page))
 317                 referenced++;
 318
 319         if (TestClearPageReferenced(page))
 320                 referenced++;
 321
 322         if (page->mapcount && page->mapping) {
 323                 if (PageAnon(page))
 324                         referenced += page_referenced_anon(page);
 325                 else
 326                         referenced += page_referenced_file(page);
 327         }
 328         return referenced;
 329 }
 330
 331 /**
 332  * page_add_anon_rmap - add pte mapping to an anonymous page
 333  * @page:       the page to add the mapping to
 334  * @vma:        the vm area in which the mapping is added
 335  * @address:    the user virtual address mapped
 336  *
 337  * The caller needs to hold the mm->page_table_lock.
 338  */
 339 void page_add_anon_rmap(struct page *page,
 340         struct vm_area_struct *vma, unsigned long address)
 341 {
 342         struct anon_vma *anon_vma = vma->anon_vma;
 343         pgoff_t index;
 344
 345         BUG_ON(PageReserved(page));
 346         BUG_ON(!anon_vma);
 347
 348         index = (address - vma->vm_start) >> PAGE_SHIFT;
 349         index += vma->vm_pgoff;
 350         index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 351
 352         /*
 353          * Setting and clearing PG_anon must always happen inside
 354          * page_map_lock to avoid races between mapping and
 355          * unmapping on different processes of the same
 356          * shared cow swapcache page. And while we take the
 357          * page_map_lock PG_anon cannot change from under us.
 358          * Actually PG_anon cannot change under fork either
 359          * since fork holds a reference on the page so it cannot
 360          * be unmapped under fork and in turn copy_page_range is
 361          * allowed to read PG_anon outside the page_map_lock.
 362          */
 363         page_map_lock(page);
 364         if (!page->mapcount) {
 365                 BUG_ON(PageAnon(page));
 366                 BUG_ON(page->mapping);
 367                 SetPageAnon(page);
 368                 page->index = index;
 369                 page->mapping = (struct address_space *) anon_vma;
 370                 inc_page_state(nr_mapped);
 371         } else {
 372                 BUG_ON(!PageAnon(page));
 373                 BUG_ON(page->index != index);
 374                 BUG_ON(page->mapping != (struct address_space *) anon_vma);
 375         }
 376         page->mapcount++;
 377         page_map_unlock(page);
 378 }
 379
 380 /**
 381  * page_add_file_rmap - add pte mapping to a file page
 382  * @page: the page to add the mapping to
 383  *
 384  * The caller needs to hold the mm->page_table_lock.
 385  */
 386 void page_add_file_rmap(struct page *page)
 387 {
 388         BUG_ON(PageAnon(page));
 389         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 390                 return;
 391
 392         page_map_lock(page);
 393         if (!page->mapcount)
 394                 inc_page_state(nr_mapped);
 395         page->mapcount++;
 396         page_map_unlock(page);
 397 }
 398
 399 /**
 400  * page_remove_rmap - take down pte mapping from a page
 401  * @page: page to remove mapping from
 402  *
 403  * Caller needs to hold the mm->page_table_lock.
 404  */
 405 void page_remove_rmap(struct page *page)
 406 {
 407         BUG_ON(PageReserved(page));
 408         BUG_ON(!page->mapcount);
 409
 410         page_map_lock(page);
 411         page->mapcount--;
 412         if (!page->mapcount) {
 413                 if (page_test_and_clear_dirty(page))
 414                         set_page_dirty(page);
 415                 if (PageAnon(page))
 416                         clear_page_anon(page);
 417                 dec_page_state(nr_mapped);
 418         }
 419         page_map_unlock(page);
 420 }
 421
 422 /*
 423  * Subfunctions of try_to_unmap: try_to_unmap_one called
 424  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 425  */
 426 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 427 {
 428         struct mm_struct *mm = vma->vm_mm;
 429         unsigned long address;
 430         pgd_t *pgd;
 431         pmd_t *pmd;
 432         pte_t *pte;
 433         pte_t pteval;
 434         int ret = SWAP_AGAIN;
 435
 436         if (!mm->rss)
 437                 goto out;
 438         address = vma_address(page, vma);
 439         if (address == -EFAULT)
 440                 goto out;
 441
 442         /*
 443          * We need the page_table_lock to protect us from page faults,
 444          * munmap, fork, etc...
 445          */
 446         if (!spin_trylock(&mm->page_table_lock))
 447                 goto out;
 448
 449         pgd = pgd_offset(mm, address);
 450         if (!pgd_present(*pgd))
 451                 goto out_unlock;
 452
 453         pmd = pmd_offset(pgd, address);
 454         if (!pmd_present(*pmd))
 455                 goto out_unlock;
 456
 457         pte = pte_offset_map(pmd, address);
 458         if (!pte_present(*pte))
 459                 goto out_unmap;
 460
 461         if (page_to_pfn(page) != pte_pfn(*pte))
 462                 goto out_unmap;
 463
 464         /*
 465          * If the page is mlock()d, we cannot swap it out.
 466          * If it's recently referenced (perhaps page_referenced
 467          * skipped over this mm) then we should reactivate it.
 468          */
 469         if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 470                         ptep_clear_flush_young(vma, address, pte)) {
 471                 ret = SWAP_FAIL;
 472                 goto out_unmap;
 473         }
 474
 475         /*
 476          * Don't pull an anonymous page out from under get_user_pages.
 477          * GUP carefully breaks COW and raises page count (while holding
 478          * page_table_lock, as we have here) to make sure that the page
 479          * cannot be freed.  If we unmap that page here, a user write
 480          * access to the virtual address will bring back the page, but
 481          * its raised count will (ironically) be taken to mean it's not
 482          * an exclusive swap page, do_wp_page will replace it by a copy
 483          * page, and the user never get to see the data GUP was holding
 484          * the original page for.
 485          *
 486          * This test is also useful for when swapoff (unuse_process) has
 487          * to drop page lock: its reference to the page stops existing
 488          * ptes from being unmapped, so swapoff can make progress.
 489          */
 490         if (PageSwapCache(page) &&
 491             page_count(page) != page->mapcount + 2) {
 492                 ret = SWAP_FAIL;
 493                 goto out_unmap;
 494         }
 495
 496         /* Nuke the page table entry. */
 497         flush_cache_page(vma, address);
 498         pteval = ptep_clear_flush(vma, address, pte);
 499
 500         /* Move the dirty bit to the physical page now the pte is gone. */
 501         if (pte_dirty(pteval))
 502                 set_page_dirty(page);
 503
 504         if (PageAnon(page)) {
 505                 swp_entry_t entry = { .val = page->private };
 506                 /*
 507                  * Store the swap location in the pte.
 508                  * See handle_pte_fault() ...
 509                  */
 510                 BUG_ON(!PageSwapCache(page));
 511                 swap_duplicate(entry);
 512                 set_pte(pte, swp_entry_to_pte(entry));
 513                 BUG_ON(pte_file(*pte));
 514         }
 515
 516         // mm->rss--;
 517         vx_rsspages_dec(mm);
 518         BUG_ON(!page->mapcount);
 519         page->mapcount--;
 520         page_cache_release(page);
 521
 522 out_unmap:
 523         pte_unmap(pte);
 524 out_unlock:
 525         spin_unlock(&mm->page_table_lock);
 526 out:
 527         return ret;
 528 }
 529
 530 /*
 531  * objrmap doesn't work for nonlinear VMAs because the assumption that
 532  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 533  * Consequently, given a particular page and its ->index, we cannot locate the
 534  * ptes which are mapping that page without an exhaustive linear search.
 535  *
 536  * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 537  * maps the file to which the target page belongs.  The ->vm_private_data field
 538  * holds the current cursor into that scan.  Successive searches will circulate
 539  * around the vma's virtual address space.
 540  *
 541  * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 542  * more scanning pressure is placed against them as well.   Eventually pages
 543  * will become fully unmapped and are eligible for eviction.
 544  *
 545  * For very sparsely populated VMAs this is a little inefficient - chances are
 546  * there there won't be many ptes located within the scan cluster.  In this case
 547  * maybe we could scan further - to the end of the pte page, perhaps.
 548  */
 549 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 550 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 551
 552 static int try_to_unmap_cluster(unsigned long cursor,
 553         unsigned int *mapcount, struct vm_area_struct *vma)
 554 {
 555         struct mm_struct *mm = vma->vm_mm;
 556         pgd_t *pgd;
 557         pmd_t *pmd;
 558         pte_t *pte;
 559         pte_t pteval;
 560         struct page *page;
 561         unsigned long address;
 562         unsigned long end;
 563         unsigned long pfn;
 564
 565         /*
 566          * We need the page_table_lock to protect us from page faults,
 567          * munmap, fork, etc...
 568          */
 569         if (!spin_trylock(&mm->page_table_lock))
 570                 return SWAP_FAIL;
 571
 572         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 573         end = address + CLUSTER_SIZE;
 574         if (address < vma->vm_start)
 575                 address = vma->vm_start;
 576         if (end > vma->vm_end)
 577                 end = vma->vm_end;
 578
 579         pgd = pgd_offset(mm, address);
 580         if (!pgd_present(*pgd))
 581                 goto out_unlock;
 582
 583         pmd = pmd_offset(pgd, address);
 584         if (!pmd_present(*pmd))
 585                 goto out_unlock;
 586
 587         for (pte = pte_offset_map(pmd, address);
 588                         address < end; pte++, address += PAGE_SIZE) {
 589
 590                 if (!pte_present(*pte))
 591                         continue;
 592
 593                 pfn = pte_pfn(*pte);
 594                 if (!pfn_valid(pfn))
 595                         continue;
 596
 597                 page = pfn_to_page(pfn);
 598                 BUG_ON(PageAnon(page));
 599                 if (PageReserved(page))
 600                         continue;
 601
 602                 if (ptep_clear_flush_young(vma, address, pte))
 603                         continue;
 604
 605                 /* Nuke the page table entry. */
 606                 flush_cache_page(vma, address);
 607                 pteval = ptep_clear_flush(vma, address, pte);
 608
 609                 /* If nonlinear, store the file page offset in the pte. */
 610                 if (page->index != linear_page_index(vma, address))
 611                         set_pte(pte, pgoff_to_pte(page->index));
 612
 613                 /* Move the dirty bit to the physical page now the pte is gone. */
 614                 if (pte_dirty(pteval))
 615                         set_page_dirty(page);
 616
 617                 page_remove_rmap(page);
 618                 page_cache_release(page);
 619                 // mm->rss--;
 620                 vx_rsspages_dec(mm);
 621                 (*mapcount)--;
 622         }
 623
 624         pte_unmap(pte);
 625
 626 out_unlock:
 627         spin_unlock(&mm->page_table_lock);
 628         return SWAP_AGAIN;
 629 }
 630
 631 static inline int try_to_unmap_anon(struct page *page)
 632 {
 633         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 634         struct vm_area_struct *vma;
 635         int ret = SWAP_AGAIN;
 636
 637         spin_lock(&anon_vma->lock);
 638         BUG_ON(list_empty(&anon_vma->head));
 639         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 640                 ret = try_to_unmap_one(page, vma);
 641                 if (ret == SWAP_FAIL || !page->mapcount)
 642                         break;
 643         }
 644         spin_unlock(&anon_vma->lock);
 645         return ret;
 646 }
 647
 648 /**
 649  * try_to_unmap_file - unmap file page using the object-based rmap method
 650  * @page: the page to unmap
 651  *
 652  * Find all the mappings of a page using the mapping pointer and the vma chains
 653  * contained in the address_space struct it points to.
 654  *
 655  * This function is only called from try_to_unmap for object-based pages.
 656  *
 657  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 658  * return a temporary error.
 659  */
 660 static inline int try_to_unmap_file(struct page *page)
 661 {
 662         struct address_space *mapping = page->mapping;
 663         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 664         struct vm_area_struct *vma = NULL;
 665         struct prio_tree_iter iter;
 666         int ret = SWAP_AGAIN;
 667         unsigned long cursor;
 668         unsigned long max_nl_cursor = 0;
 669         unsigned long max_nl_size = 0;
 670         unsigned int mapcount;
 671
 672         if (!spin_trylock(&mapping->i_mmap_lock))
 673                 return ret;
 674
 675         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 676                                         &iter, pgoff, pgoff)) != NULL) {
 677                 ret = try_to_unmap_one(page, vma);
 678                 if (ret == SWAP_FAIL || !page->mapcount)
 679                         goto out;
 680         }
 681
 682         if (list_empty(&mapping->i_mmap_nonlinear))
 683                 goto out;
 684
 685         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 686                                                 shared.vm_set.list) {
 687                 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 688                         continue;
 689                 cursor = (unsigned long) vma->vm_private_data;
 690                 if (cursor > max_nl_cursor)
 691                         max_nl_cursor = cursor;
 692                 cursor = vma->vm_end - vma->vm_start;
 693                 if (cursor > max_nl_size)
 694                         max_nl_size = cursor;
 695         }
 696
 697         if (max_nl_size == 0)   /* any nonlinears locked or reserved */
 698                 goto out;
 699
 700         /*
 701          * We don't try to search for this page in the nonlinear vmas,
 702          * and page_referenced wouldn't have found it anyway.  Instead
 703          * just walk the nonlinear vmas trying to age and unmap some.
 704          * The mapcount of the page we came in with is irrelevant,
 705          * but even so use it as a guide to how hard we should try?
 706          */
 707         mapcount = page->mapcount;
 708         page_map_unlock(page);
 709         cond_resched_lock(&mapping->i_mmap_lock);
 710
 711         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 712         if (max_nl_cursor == 0)
 713                 max_nl_cursor = CLUSTER_SIZE;
 714
 715         do {
 716                 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 717                                                 shared.vm_set.list) {
 718                         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 719                                 continue;
 720                         cursor = (unsigned long) vma->vm_private_data;
 721                         while (vma->vm_mm->rss &&
 722                                 cursor < max_nl_cursor &&
 723                                 cursor < vma->vm_end - vma->vm_start) {
 724                                 ret = try_to_unmap_cluster(
 725                                                 cursor, &mapcount, vma);
 726                                 if (ret == SWAP_FAIL)
 727                                         break;
 728                                 cursor += CLUSTER_SIZE;
 729                                 vma->vm_private_data = (void *) cursor;
 730                                 if ((int)mapcount <= 0)
 731                                         goto relock;
 732                         }
 733                         if (ret != SWAP_FAIL)
 734                                 vma->vm_private_data =
 735                                         (void *) max_nl_cursor;
 736                         ret = SWAP_AGAIN;
 737                 }
 738                 cond_resched_lock(&mapping->i_mmap_lock);
 739                 max_nl_cursor += CLUSTER_SIZE;
 740         } while (max_nl_cursor <= max_nl_size);
 741
 742         /*
 743          * Don't loop forever (perhaps all the remaining pages are
 744          * in locked vmas).  Reset cursor on all unreserved nonlinear
 745          * vmas, now forgetting on which ones it had fallen behind.
 746          */
 747         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 748                                                 shared.vm_set.list) {
 749                 if (!(vma->vm_flags & VM_RESERVED))
 750                         vma->vm_private_data = NULL;
 751         }
 752 relock:
 753         page_map_lock(page);
 754 out:
 755         spin_unlock(&mapping->i_mmap_lock);
 756         return ret;
 757 }
 758
 759 /**
 760  * try_to_unmap - try to remove all page table mappings to a page
 761  * @page: the page to get unmapped
 762  *
 763  * Tries to remove all the page table entries which are mapping this
 764  * page, used in the pageout path.  Caller must hold the page lock
 765  * and its rmap lock.  Return values are:
 766  *
 767  * SWAP_SUCCESS - we succeeded in removing all mappings
 768  * SWAP_AGAIN   - we missed a trylock, try again later
 769  * SWAP_FAIL    - the page is unswappable
 770  */
 771 int try_to_unmap(struct page *page)
 772 {
 773         int ret;
 774
 775         BUG_ON(PageReserved(page));
 776         BUG_ON(!PageLocked(page));
 777         BUG_ON(!page->mapcount);
 778
 779         if (PageAnon(page))
 780                 ret = try_to_unmap_anon(page);
 781         else
 782                 ret = try_to_unmap_file(page);
 783
 784         if (!page->mapcount) {
 785                 if (page_test_and_clear_dirty(page))
 786                         set_page_dirty(page);
 787                 if (PageAnon(page))
 788                         clear_page_anon(page);
 789                 dec_page_state(nr_mapped);
 790                 ret = SWAP_SUCCESS;
 791         }
 792         return ret;
 793 }