mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Locking: see "Lock ordering" summary in filemap.c.
  22  * In swapout, page_map_lock is held on entry to page_referenced and
  23  * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
  24  */
  25
  26 #include <linux/mm.h>
  27 #include <linux/pagemap.h>
  28 #include <linux/swap.h>
  29 #include <linux/swapops.h>
  30 #include <linux/slab.h>
  31 #include <linux/init.h>
  32 #include <linux/rmap.h>
  33 #include <linux/vs_memory.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 //#define RMAP_DEBUG /* can be enabled only for debugging */
  38
  39 kmem_cache_t *anon_vma_cachep;
  40
  41 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  42 {
  43 #ifdef RMAP_DEBUG
  44         struct anon_vma *anon_vma = find_vma->anon_vma;
  45         struct vm_area_struct *vma;
  46         unsigned int mapcount = 0;
  47         int found = 0;
  48
  49         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  50                 mapcount++;
  51                 BUG_ON(mapcount > 100000);
  52                 if (vma == find_vma)
  53                         found = 1;
  54         }
  55         BUG_ON(!found);
  56 #endif
  57 }
  58
  59 /* This must be called under the mmap_sem. */
  60 int anon_vma_prepare(struct vm_area_struct *vma)
  61 {
  62         struct anon_vma *anon_vma = vma->anon_vma;
  63
  64         might_sleep();
  65         if (unlikely(!anon_vma)) {
  66                 struct mm_struct *mm = vma->vm_mm;
  67                 struct anon_vma *allocated = NULL;
  68
  69                 anon_vma = find_mergeable_anon_vma(vma);
  70                 if (!anon_vma) {
  71                         anon_vma = anon_vma_alloc();
  72                         if (unlikely(!anon_vma))
  73                                 return -ENOMEM;
  74                         allocated = anon_vma;
  75                 }
  76
  77                 /* page_table_lock to protect against threads */
  78                 spin_lock(&mm->page_table_lock);
  79                 if (likely(!vma->anon_vma)) {
  80                         if (!allocated)
  81                                 spin_lock(&anon_vma->lock);
  82                         vma->anon_vma = anon_vma;
  83                         list_add(&vma->anon_vma_node, &anon_vma->head);
  84                         if (!allocated)
  85                                 spin_unlock(&anon_vma->lock);
  86                         allocated = NULL;
  87                 }
  88                 spin_unlock(&mm->page_table_lock);
  89                 if (unlikely(allocated))
  90                         anon_vma_free(allocated);
  91         }
  92         return 0;
  93 }
  94
  95 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  96 {
  97         if (!vma->anon_vma) {
  98                 BUG_ON(!next->anon_vma);
  99                 vma->anon_vma = next->anon_vma;
 100                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 101         } else {
 102                 /* if they're both non-null they must be the same */
 103                 BUG_ON(vma->anon_vma != next->anon_vma);
 104         }
 105         list_del(&next->anon_vma_node);
 106 }
 107
 108 void __anon_vma_link(struct vm_area_struct *vma)
 109 {
 110         struct anon_vma *anon_vma = vma->anon_vma;
 111
 112         if (anon_vma) {
 113                 list_add(&vma->anon_vma_node, &anon_vma->head);
 114                 validate_anon_vma(vma);
 115         }
 116 }
 117
 118 void anon_vma_link(struct vm_area_struct *vma)
 119 {
 120         struct anon_vma *anon_vma = vma->anon_vma;
 121
 122         if (anon_vma) {
 123                 spin_lock(&anon_vma->lock);
 124                 list_add(&vma->anon_vma_node, &anon_vma->head);
 125                 validate_anon_vma(vma);
 126                 spin_unlock(&anon_vma->lock);
 127         }
 128 }
 129
 130 void anon_vma_unlink(struct vm_area_struct *vma)
 131 {
 132         struct anon_vma *anon_vma = vma->anon_vma;
 133         int empty;
 134
 135         if (!anon_vma)
 136                 return;
 137
 138         spin_lock(&anon_vma->lock);
 139         validate_anon_vma(vma);
 140         list_del(&vma->anon_vma_node);
 141
 142         /* We must garbage collect the anon_vma if it's empty */
 143         empty = list_empty(&anon_vma->head);
 144         spin_unlock(&anon_vma->lock);
 145
 146         if (empty)
 147                 anon_vma_free(anon_vma);
 148 }
 149
 150 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 151 {
 152         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 153                                                 SLAB_CTOR_CONSTRUCTOR) {
 154                 struct anon_vma *anon_vma = data;
 155
 156                 spin_lock_init(&anon_vma->lock);
 157                 INIT_LIST_HEAD(&anon_vma->head);
 158         }
 159 }
 160
 161 void __init anon_vma_init(void)
 162 {
 163         anon_vma_cachep = kmem_cache_create("anon_vma",
 164                 sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 165 }
 166
 167 /* this needs the page->flags PG_maplock held */
 168 static inline void clear_page_anon(struct page *page)
 169 {
 170         BUG_ON(!page->mapping);
 171         page->mapping = NULL;
 172         ClearPageAnon(page);
 173 }
 174
 175 /*
 176  * At what user virtual address is page expected in vma?
 177  */
 178 static inline unsigned long
 179 vma_address(struct page *page, struct vm_area_struct *vma)
 180 {
 181         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 182         unsigned long address;
 183
 184         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 185         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 186                 /* page should be within any vma from prio_tree_next */
 187                 BUG_ON(!PageAnon(page));
 188                 return -EFAULT;
 189         }
 190         return address;
 191 }
 192
 193 /*
 194  * Subfunctions of page_referenced: page_referenced_one called
 195  * repeatedly from either page_referenced_anon or page_referenced_file.
 196  */
 197 static int page_referenced_one(struct page *page,
 198         struct vm_area_struct *vma, unsigned int *mapcount)
 199 {
 200         struct mm_struct *mm = vma->vm_mm;
 201         unsigned long address;
 202         pgd_t *pgd;
 203         pmd_t *pmd;
 204         pte_t *pte;
 205         int referenced = 0;
 206
 207         if (!mm->rss)
 208                 goto out;
 209         address = vma_address(page, vma);
 210         if (address == -EFAULT)
 211                 goto out;
 212
 213         if (!spin_trylock(&mm->page_table_lock))
 214                 goto out;
 215
 216         pgd = pgd_offset(mm, address);
 217         if (!pgd_present(*pgd))
 218                 goto out_unlock;
 219
 220         pmd = pmd_offset(pgd, address);
 221         if (!pmd_present(*pmd))
 222                 goto out_unlock;
 223
 224         pte = pte_offset_map(pmd, address);
 225         if (!pte_present(*pte))
 226                 goto out_unmap;
 227
 228         if (page_to_pfn(page) != pte_pfn(*pte))
 229                 goto out_unmap;
 230
 231         if (ptep_clear_flush_young(vma, address, pte))
 232                 referenced++;
 233
 234         if (mm != current->mm && has_swap_token(mm))
 235                 referenced++;
 236
 237         (*mapcount)--;
 238
 239 out_unmap:
 240         pte_unmap(pte);
 241 out_unlock:
 242         spin_unlock(&mm->page_table_lock);
 243 out:
 244         return referenced;
 245 }
 246
 247 static inline int page_referenced_anon(struct page *page)
 248 {
 249         unsigned int mapcount = page->mapcount;
 250         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 251         struct vm_area_struct *vma;
 252         int referenced = 0;
 253
 254         spin_lock(&anon_vma->lock);
 255         BUG_ON(list_empty(&anon_vma->head));
 256         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 257                 referenced += page_referenced_one(page, vma, &mapcount);
 258                 if (!mapcount)
 259                         break;
 260         }
 261         spin_unlock(&anon_vma->lock);
 262         return referenced;
 263 }
 264
 265 /**
 266  * page_referenced_file - referenced check for object-based rmap
 267  * @page: the page we're checking references on.
 268  *
 269  * For an object-based mapped page, find all the places it is mapped and
 270  * check/clear the referenced flag.  This is done by following the page->mapping
 271  * pointer, then walking the chain of vmas it holds.  It returns the number
 272  * of references it found.
 273  *
 274  * This function is only called from page_referenced for object-based pages.
 275  *
 276  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 277  * assume a reference count of 0, so try_to_unmap will then have a go.
 278  */
 279 static inline int page_referenced_file(struct page *page)
 280 {
 281         unsigned int mapcount = page->mapcount;
 282         struct address_space *mapping = page->mapping;
 283         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 284         struct vm_area_struct *vma = NULL;
 285         struct prio_tree_iter iter;
 286         int referenced = 0;
 287
 288         if (!spin_trylock(&mapping->i_mmap_lock))
 289                 return 0;
 290
 291         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 292                                         &iter, pgoff, pgoff)) != NULL) {
 293                 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 294                                   == (VM_LOCKED|VM_MAYSHARE)) {
 295                         referenced++;
 296                         break;
 297                 }
 298                 referenced += page_referenced_one(page, vma, &mapcount);
 299                 if (!mapcount)
 300                         break;
 301         }
 302
 303         spin_unlock(&mapping->i_mmap_lock);
 304         return referenced;
 305 }
 306
 307 /**
 308  * page_referenced - test if the page was referenced
 309  * @page: the page to test
 310  *
 311  * Quick test_and_clear_referenced for all mappings to a page,
 312  * returns the number of ptes which referenced the page.
 313  * Caller needs to hold the rmap lock.
 314  */
 315 int page_referenced(struct page *page)
 316 {
 317         int referenced = 0;
 318
 319         if (page_test_and_clear_young(page))
 320                 referenced++;
 321
 322         if (TestClearPageReferenced(page))
 323                 referenced++;
 324
 325         if (page->mapcount && page->mapping) {
 326                 if (PageAnon(page))
 327                         referenced += page_referenced_anon(page);
 328                 else
 329                         referenced += page_referenced_file(page);
 330         }
 331         return referenced;
 332 }
 333
 334 /**
 335  * page_add_anon_rmap - add pte mapping to an anonymous page
 336  * @page:       the page to add the mapping to
 337  * @vma:        the vm area in which the mapping is added
 338  * @address:    the user virtual address mapped
 339  *
 340  * The caller needs to hold the mm->page_table_lock.
 341  */
 342 void page_add_anon_rmap(struct page *page,
 343         struct vm_area_struct *vma, unsigned long address)
 344 {
 345         struct anon_vma *anon_vma = vma->anon_vma;
 346         pgoff_t index;
 347
 348         BUG_ON(PageReserved(page));
 349         BUG_ON(!anon_vma);
 350
 351         index = (address - vma->vm_start) >> PAGE_SHIFT;
 352         index += vma->vm_pgoff;
 353         index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 354
 355         /*
 356          * Setting and clearing PG_anon must always happen inside
 357          * page_map_lock to avoid races between mapping and
 358          * unmapping on different processes of the same
 359          * shared cow swapcache page. And while we take the
 360          * page_map_lock PG_anon cannot change from under us.
 361          * Actually PG_anon cannot change under fork either
 362          * since fork holds a reference on the page so it cannot
 363          * be unmapped under fork and in turn copy_page_range is
 364          * allowed to read PG_anon outside the page_map_lock.
 365          */
 366         page_map_lock(page);
 367         if (!page->mapcount) {
 368                 BUG_ON(PageAnon(page));
 369                 BUG_ON(page->mapping);
 370                 SetPageAnon(page);
 371                 page->index = index;
 372                 page->mapping = (struct address_space *) anon_vma;
 373                 inc_page_state(nr_mapped);
 374         } else {
 375                 BUG_ON(!PageAnon(page));
 376                 BUG_ON(page->index != index);
 377                 BUG_ON(page->mapping != (struct address_space *) anon_vma);
 378         }
 379         page->mapcount++;
 380         page_map_unlock(page);
 381 }
 382
 383 /**
 384  * page_add_file_rmap - add pte mapping to a file page
 385  * @page: the page to add the mapping to
 386  *
 387  * The caller needs to hold the mm->page_table_lock.
 388  */
 389 void page_add_file_rmap(struct page *page)
 390 {
 391         BUG_ON(PageAnon(page));
 392         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 393                 return;
 394
 395         page_map_lock(page);
 396         if (!page->mapcount)
 397                 inc_page_state(nr_mapped);
 398         page->mapcount++;
 399         page_map_unlock(page);
 400 }
 401
 402 /**
 403  * page_remove_rmap - take down pte mapping from a page
 404  * @page: page to remove mapping from
 405  *
 406  * Caller needs to hold the mm->page_table_lock.
 407  */
 408 void page_remove_rmap(struct page *page)
 409 {
 410         BUG_ON(PageReserved(page));
 411         BUG_ON(!page->mapcount);
 412
 413         page_map_lock(page);
 414         page->mapcount--;
 415         if (!page->mapcount) {
 416                 if (page_test_and_clear_dirty(page))
 417                         set_page_dirty(page);
 418                 if (PageAnon(page))
 419                         clear_page_anon(page);
 420                 dec_page_state(nr_mapped);
 421         }
 422         page_map_unlock(page);
 423 }
 424
 425 /*
 426  * Subfunctions of try_to_unmap: try_to_unmap_one called
 427  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 428  */
 429 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 430 {
 431         struct mm_struct *mm = vma->vm_mm;
 432         unsigned long address;
 433         pgd_t *pgd;
 434         pmd_t *pmd;
 435         pte_t *pte;
 436         pte_t pteval;
 437         int ret = SWAP_AGAIN;
 438
 439         if (!mm->rss)
 440                 goto out;
 441         address = vma_address(page, vma);
 442         if (address == -EFAULT)
 443                 goto out;
 444
 445         /*
 446          * We need the page_table_lock to protect us from page faults,
 447          * munmap, fork, etc...
 448          */
 449         if (!spin_trylock(&mm->page_table_lock))
 450                 goto out;
 451
 452         pgd = pgd_offset(mm, address);
 453         if (!pgd_present(*pgd))
 454                 goto out_unlock;
 455
 456         pmd = pmd_offset(pgd, address);
 457         if (!pmd_present(*pmd))
 458                 goto out_unlock;
 459
 460         pte = pte_offset_map(pmd, address);
 461         if (!pte_present(*pte))
 462                 goto out_unmap;
 463
 464         if (page_to_pfn(page) != pte_pfn(*pte))
 465                 goto out_unmap;
 466
 467         /*
 468          * If the page is mlock()d, we cannot swap it out.
 469          * If it's recently referenced (perhaps page_referenced
 470          * skipped over this mm) then we should reactivate it.
 471          */
 472         if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 473                         ptep_clear_flush_young(vma, address, pte)) {
 474                 ret = SWAP_FAIL;
 475                 goto out_unmap;
 476         }
 477
 478         /*
 479          * Don't pull an anonymous page out from under get_user_pages.
 480          * GUP carefully breaks COW and raises page count (while holding
 481          * page_table_lock, as we have here) to make sure that the page
 482          * cannot be freed.  If we unmap that page here, a user write
 483          * access to the virtual address will bring back the page, but
 484          * its raised count will (ironically) be taken to mean it's not
 485          * an exclusive swap page, do_wp_page will replace it by a copy
 486          * page, and the user never get to see the data GUP was holding
 487          * the original page for.
 488          *
 489          * This test is also useful for when swapoff (unuse_process) has
 490          * to drop page lock: its reference to the page stops existing
 491          * ptes from being unmapped, so swapoff can make progress.
 492          */
 493         if (PageSwapCache(page) &&
 494             page_count(page) != page->mapcount + 2) {
 495                 ret = SWAP_FAIL;
 496                 goto out_unmap;
 497         }
 498
 499         /* Nuke the page table entry. */
 500         flush_cache_page(vma, address);
 501         pteval = ptep_clear_flush(vma, address, pte);
 502
 503         /* Move the dirty bit to the physical page now the pte is gone. */
 504         if (pte_dirty(pteval))
 505                 set_page_dirty(page);
 506
 507         if (PageAnon(page)) {
 508                 swp_entry_t entry = { .val = page->private };
 509                 /*
 510                  * Store the swap location in the pte.
 511                  * See handle_pte_fault() ...
 512                  */
 513                 BUG_ON(!PageSwapCache(page));
 514                 swap_duplicate(entry);
 515                 set_pte(pte, swp_entry_to_pte(entry));
 516                 BUG_ON(pte_file(*pte));
 517         }
 518
 519         // mm->rss--;
 520         vx_rsspages_dec(mm);
 521         BUG_ON(!page->mapcount);
 522         page->mapcount--;
 523         page_cache_release(page);
 524
 525 out_unmap:
 526         pte_unmap(pte);
 527 out_unlock:
 528         spin_unlock(&mm->page_table_lock);
 529 out:
 530         return ret;
 531 }
 532
 533 /*
 534  * objrmap doesn't work for nonlinear VMAs because the assumption that
 535  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 536  * Consequently, given a particular page and its ->index, we cannot locate the
 537  * ptes which are mapping that page without an exhaustive linear search.
 538  *
 539  * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 540  * maps the file to which the target page belongs.  The ->vm_private_data field
 541  * holds the current cursor into that scan.  Successive searches will circulate
 542  * around the vma's virtual address space.
 543  *
 544  * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 545  * more scanning pressure is placed against them as well.   Eventually pages
 546  * will become fully unmapped and are eligible for eviction.
 547  *
 548  * For very sparsely populated VMAs this is a little inefficient - chances are
 549  * there there won't be many ptes located within the scan cluster.  In this case
 550  * maybe we could scan further - to the end of the pte page, perhaps.
 551  */
 552 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 553 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 554
 555 static int try_to_unmap_cluster(unsigned long cursor,
 556         unsigned int *mapcount, struct vm_area_struct *vma)
 557 {
 558         struct mm_struct *mm = vma->vm_mm;
 559         pgd_t *pgd;
 560         pmd_t *pmd;
 561         pte_t *pte;
 562         pte_t pteval;
 563         struct page *page;
 564         unsigned long address;
 565         unsigned long end;
 566         unsigned long pfn;
 567
 568         /*
 569          * We need the page_table_lock to protect us from page faults,
 570          * munmap, fork, etc...
 571          */
 572         if (!spin_trylock(&mm->page_table_lock))
 573                 return SWAP_FAIL;
 574
 575         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 576         end = address + CLUSTER_SIZE;
 577         if (address < vma->vm_start)
 578                 address = vma->vm_start;
 579         if (end > vma->vm_end)
 580                 end = vma->vm_end;
 581
 582         pgd = pgd_offset(mm, address);
 583         if (!pgd_present(*pgd))
 584                 goto out_unlock;
 585
 586         pmd = pmd_offset(pgd, address);
 587         if (!pmd_present(*pmd))
 588                 goto out_unlock;
 589
 590         for (pte = pte_offset_map(pmd, address);
 591                         address < end; pte++, address += PAGE_SIZE) {
 592
 593                 if (!pte_present(*pte))
 594                         continue;
 595
 596                 pfn = pte_pfn(*pte);
 597                 if (!pfn_valid(pfn))
 598                         continue;
 599
 600                 page = pfn_to_page(pfn);
 601                 BUG_ON(PageAnon(page));
 602                 if (PageReserved(page))
 603                         continue;
 604
 605                 if (ptep_clear_flush_young(vma, address, pte))
 606                         continue;
 607
 608                 /* Nuke the page table entry. */
 609                 flush_cache_page(vma, address);
 610                 pteval = ptep_clear_flush(vma, address, pte);
 611
 612                 /* If nonlinear, store the file page offset in the pte. */
 613                 if (page->index != linear_page_index(vma, address))
 614                         set_pte(pte, pgoff_to_pte(page->index));
 615
 616                 /* Move the dirty bit to the physical page now the pte is gone. */
 617                 if (pte_dirty(pteval))
 618                         set_page_dirty(page);
 619
 620                 page_remove_rmap(page);
 621                 page_cache_release(page);
 622                 mm->rss--;
 623                 (*mapcount)--;
 624         }
 625
 626         pte_unmap(pte);
 627
 628 out_unlock:
 629         spin_unlock(&mm->page_table_lock);
 630         return SWAP_AGAIN;
 631 }
 632
 633 static inline int try_to_unmap_anon(struct page *page)
 634 {
 635         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 636         struct vm_area_struct *vma;
 637         int ret = SWAP_AGAIN;
 638
 639         spin_lock(&anon_vma->lock);
 640         BUG_ON(list_empty(&anon_vma->head));
 641         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 642                 ret = try_to_unmap_one(page, vma);
 643                 if (ret == SWAP_FAIL || !page->mapcount)
 644                         break;
 645         }
 646         spin_unlock(&anon_vma->lock);
 647         return ret;
 648 }
 649
 650 /**
 651  * try_to_unmap_file - unmap file page using the object-based rmap method
 652  * @page: the page to unmap
 653  *
 654  * Find all the mappings of a page using the mapping pointer and the vma chains
 655  * contained in the address_space struct it points to.
 656  *
 657  * This function is only called from try_to_unmap for object-based pages.
 658  *
 659  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 660  * return a temporary error.
 661  */
 662 static inline int try_to_unmap_file(struct page *page)
 663 {
 664         struct address_space *mapping = page->mapping;
 665         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 666         struct vm_area_struct *vma = NULL;
 667         struct prio_tree_iter iter;
 668         int ret = SWAP_AGAIN;
 669         unsigned long cursor;
 670         unsigned long max_nl_cursor = 0;
 671         unsigned long max_nl_size = 0;
 672         unsigned int mapcount;
 673
 674         if (!spin_trylock(&mapping->i_mmap_lock))
 675                 return ret;
 676
 677         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 678                                         &iter, pgoff, pgoff)) != NULL) {
 679                 ret = try_to_unmap_one(page, vma);
 680                 if (ret == SWAP_FAIL || !page->mapcount)
 681                         goto out;
 682         }
 683
 684         if (list_empty(&mapping->i_mmap_nonlinear))
 685                 goto out;
 686
 687         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 688                                                 shared.vm_set.list) {
 689                 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 690                         continue;
 691                 cursor = (unsigned long) vma->vm_private_data;
 692                 if (cursor > max_nl_cursor)
 693                         max_nl_cursor = cursor;
 694                 cursor = vma->vm_end - vma->vm_start;
 695                 if (cursor > max_nl_size)
 696                         max_nl_size = cursor;
 697         }
 698
 699         if (max_nl_size == 0)   /* any nonlinears locked or reserved */
 700                 goto out;
 701
 702         /*
 703          * We don't try to search for this page in the nonlinear vmas,
 704          * and page_referenced wouldn't have found it anyway.  Instead
 705          * just walk the nonlinear vmas trying to age and unmap some.
 706          * The mapcount of the page we came in with is irrelevant,
 707          * but even so use it as a guide to how hard we should try?
 708          */
 709         mapcount = page->mapcount;
 710         page_map_unlock(page);
 711         cond_resched_lock(&mapping->i_mmap_lock);
 712
 713         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 714         if (max_nl_cursor == 0)
 715                 max_nl_cursor = CLUSTER_SIZE;
 716
 717         do {
 718                 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 719                                                 shared.vm_set.list) {
 720                         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 721                                 continue;
 722                         cursor = (unsigned long) vma->vm_private_data;
 723                         while (vma->vm_mm->rss &&
 724                                 cursor < max_nl_cursor &&
 725                                 cursor < vma->vm_end - vma->vm_start) {
 726                                 ret = try_to_unmap_cluster(
 727                                                 cursor, &mapcount, vma);
 728                                 if (ret == SWAP_FAIL)
 729                                         break;
 730                                 cursor += CLUSTER_SIZE;
 731                                 vma->vm_private_data = (void *) cursor;
 732                                 if ((int)mapcount <= 0)
 733                                         goto relock;
 734                         }
 735                         if (ret != SWAP_FAIL)
 736                                 vma->vm_private_data =
 737                                         (void *) max_nl_cursor;
 738                         ret = SWAP_AGAIN;
 739                 }
 740                 cond_resched_lock(&mapping->i_mmap_lock);
 741                 max_nl_cursor += CLUSTER_SIZE;
 742         } while (max_nl_cursor <= max_nl_size);
 743
 744         /*
 745          * Don't loop forever (perhaps all the remaining pages are
 746          * in locked vmas).  Reset cursor on all unreserved nonlinear
 747          * vmas, now forgetting on which ones it had fallen behind.
 748          */
 749         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 750                                                 shared.vm_set.list) {
 751                 if (!(vma->vm_flags & VM_RESERVED))
 752                         vma->vm_private_data = NULL;
 753         }
 754 relock:
 755         page_map_lock(page);
 756 out:
 757         spin_unlock(&mapping->i_mmap_lock);
 758         return ret;
 759 }
 760
 761 /**
 762  * try_to_unmap - try to remove all page table mappings to a page
 763  * @page: the page to get unmapped
 764  *
 765  * Tries to remove all the page table entries which are mapping this
 766  * page, used in the pageout path.  Caller must hold the page lock
 767  * and its rmap lock.  Return values are:
 768  *
 769  * SWAP_SUCCESS - we succeeded in removing all mappings
 770  * SWAP_AGAIN   - we missed a trylock, try again later
 771  * SWAP_FAIL    - the page is unswappable
 772  */
 773 int try_to_unmap(struct page *page)
 774 {
 775         int ret;
 776
 777         BUG_ON(PageReserved(page));
 778         BUG_ON(!PageLocked(page));
 779         BUG_ON(!page->mapcount);
 780
 781         if (PageAnon(page))
 782                 ret = try_to_unmap_anon(page);
 783         else
 784                 ret = try_to_unmap_file(page);
 785
 786         if (!page->mapcount) {
 787                 if (page_test_and_clear_dirty(page))
 788                         set_page_dirty(page);
 789                 if (PageAnon(page))
 790                         clear_page_anon(page);
 791                 dec_page_state(nr_mapped);
 792                 ret = SWAP_SUCCESS;
 793         }
 794         return ret;
 795 }