mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Locking: see "Lock ordering" summary in filemap.c.
  22  * In swapout, page_map_lock is held on entry to page_referenced and
  23  * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
  24  */
  25
  26 #include <linux/mm.h>
  27 #include <linux/pagemap.h>
  28 #include <linux/swap.h>
  29 #include <linux/swapops.h>
  30 #include <linux/slab.h>
  31 #include <linux/init.h>
  32 #include <linux/rmap.h>
  33 #include <linux/vs_memory.h>
  34
  35 #include <asm/tlbflush.h>
  36
  37 //#define RMAP_DEBUG /* can be enabled only for debugging */
  38
  39 kmem_cache_t *anon_vma_cachep;  /* slab cache for struct anon_vma objects; created in anon_vma_init() */
  40
  41 /*
  42  * Debug-only consistency check, compiled in only when RMAP_DEBUG is
  43  * defined: walk the list of find_vma's anon_vma and BUG unless find_vma
  44  * itself is on it.  The walk also BUGs once it has seen more than
  45  * 100000 entries.  Without RMAP_DEBUG this is an empty function.
  46  */
  47 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  48 {
  49 #ifdef RMAP_DEBUG
  50         struct anon_vma *av = find_vma->anon_vma;
  51         struct vm_area_struct *cur;
  52         unsigned int walked = 0;
  53         int seen = 0;
  54
  55         list_for_each_entry(cur, &av->head, anon_vma_node) {
  56                 walked += 1;
  57                 BUG_ON(walked > 100000);
  58                 seen |= (find_vma == cur);
  59         }
  60         BUG_ON(!seen);
  61 #endif
  62 }
  58
  59 /* This must be called under the mmap_sem. */
     /*
      * anon_vma_prepare - make sure @vma has an anon_vma attached.
      *
      * If vma->anon_vma is already set this does nothing.  Otherwise it takes
      * the anon_vma offered by find_mergeable_anon_vma(), or allocates a new
      * one with anon_vma_alloc(), and links the vma onto that anon_vma's list.
      *
      * Calls might_sleep().  Returns 0 on success, or -ENOMEM if a new
      * anon_vma was needed and could not be allocated.
      */
  60 int anon_vma_prepare(struct vm_area_struct *vma)
  61 {
  62         struct anon_vma *anon_vma = vma->anon_vma;
  63
  64         might_sleep();
  65         if (unlikely(!anon_vma)) {
  66                 struct mm_struct *mm = vma->vm_mm;
  67                 struct anon_vma *allocated = NULL;
  68
                     /* prefer an existing anon_vma; allocate only if none is offered */
  69                 anon_vma = find_mergeable_anon_vma(vma);
  70                 if (!anon_vma) {
  71                         anon_vma = anon_vma_alloc();
  72                         if (unlikely(!anon_vma))
  73                                 return -ENOMEM;
                             /* remember that it is ours, so it can be freed if unused */
  74                         allocated = anon_vma;
  75                 }
  76
  77                 /* page_table_lock to protect against threads */
  78                 spin_lock(&mm->page_table_lock);
                     /* re-check under the lock: vma->anon_vma may have been set meanwhile */
  79                 if (likely(!vma->anon_vma)) {
                             /*
                              * Only an anon_vma that was not allocated here gets
                              * its lock taken around the list insertion
                              * (presumably the fresh one is not yet visible to
                              * anyone else - TODO confirm).
                              */
  80                         if (!allocated)
  81                                 spin_lock(&anon_vma->lock);
  82                         vma->anon_vma = anon_vma;
  83                         list_add(&vma->anon_vma_node, &anon_vma->head);
  84                         if (!allocated)
  85                                 spin_unlock(&anon_vma->lock);
                             /* the anon_vma is now in use: do not free it below */
  86                         allocated = NULL;
  87                 }
  88                 spin_unlock(&mm->page_table_lock);
                     /* vma->anon_vma was set by someone else first: drop our unused allocation */
  89                 if (unlikely(allocated))
  90                         anon_vma_free(allocated);
  91         }
  92         return 0;
  93 }
  94
  95 /*
  96  * Fold next's anon_vma membership into vma and take next off the list.
  97  * If vma has no anon_vma yet it adopts next's (which must exist) and is
  98  * queued right after next; otherwise both must already share one
  99  * anon_vma.  No lock is taken in here.
 100  */
 101 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
 102 {
 103         if (vma->anon_vma) {
 104                 /* both are set, so they have to be the very same anon_vma */
 105                 BUG_ON(next->anon_vma != vma->anon_vma);
 106         } else {
 107                 struct anon_vma *inherited = next->anon_vma;
 108
 109                 BUG_ON(!inherited);
 110                 vma->anon_vma = inherited;
 111                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 112         }
 113         list_del(&next->anon_vma_node);
 114 }
 107
 108 /*
 109  * Queue vma at the head of its anon_vma's list, if it has an anon_vma,
 110  * then run the RMAP_DEBUG validation.  Takes no lock itself.
 111  */
 112 void __anon_vma_link(struct vm_area_struct *vma)
 113 {
 114         struct anon_vma *av = vma->anon_vma;
 115
 116         if (!av)
 117                 return;
 118
 119         list_add(&vma->anon_vma_node, &av->head);
 120         validate_anon_vma(vma);
 121 }
 117
 118 /*
 119  * Queue vma at the head of its anon_vma's list while holding that
 120  * anon_vma's spinlock.  A vma without an anon_vma is left untouched.
 121  */
 122 void anon_vma_link(struct vm_area_struct *vma)
 123 {
 124         struct anon_vma *av = vma->anon_vma;
 125
 126         if (!av)
 127                 return;
 128
 129         spin_lock(&av->lock);
 130         list_add(&vma->anon_vma_node, &av->head);
 131         validate_anon_vma(vma);
 132         spin_unlock(&av->lock);
 133 }
 129
 130 /*
 131  * Take vma off its anon_vma's list (under the anon_vma lock) and free
 132  * the anon_vma if that left the list empty.  Does nothing for a vma
 133  * that has no anon_vma.
 134  */
 135 void anon_vma_unlink(struct vm_area_struct *vma)
 136 {
 137         struct anon_vma *av = vma->anon_vma;
 138
 139         if (av) {
 140                 int now_empty;
 141
 142                 spin_lock(&av->lock);
 143                 validate_anon_vma(vma);
 144                 list_del(&vma->anon_vma_node);
 145                 /* sample emptiness while locked; the free happens after unlock */
 146                 now_empty = list_empty(&av->head);
 147                 spin_unlock(&av->lock);
 148
 149                 if (now_empty)
 150                         anon_vma_free(av);
 151         }
 152 }
 149
 150 /*
 151  * Slab constructor for struct anon_vma (passed to kmem_cache_create() in
 152  * anon_vma_init()).  Initialises the lock and the list head, but only
 153  * when flags has SLAB_CTOR_CONSTRUCTOR set and SLAB_CTOR_VERIFY clear.
 154  */
 155 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 156 {
 157         unsigned long mode = flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR);
 158         struct anon_vma *av = data;
 159
 160         if (mode != SLAB_CTOR_CONSTRUCTOR)
 161                 return;
 162
 163         spin_lock_init(&av->lock);
 164         INIT_LIST_HEAD(&av->head);
 165 }
 160
 161 /*
 162  * Boot-time setup: create the "anon_vma" slab cache with anon_vma_ctor
 163  * as its constructor and store it in anon_vma_cachep.  The result is not
 164  * checked here; SLAB_PANIC is passed to kmem_cache_create().
 165  */
 166 void __init anon_vma_init(void)
 167 {
 168         anon_vma_cachep = kmem_cache_create("anon_vma",
 169                                             sizeof(struct anon_vma),
 170                                             0,
 171                                             SLAB_PANIC,
 172                                             anon_vma_ctor,
 173                                             NULL);
 174 }
 166
 167 /*
 168  * Detach an anonymous page from its anon_vma: reset page->mapping and
 169  * then clear the PageAnon flag.  BUGs if page->mapping is already NULL.
 170  * The caller must hold the page->flags PG_maplock.
 171  */
 172 static inline void clear_page_anon(struct page *page)
 173 {
 174         BUG_ON(page->mapping == NULL);
 175
 176         page->mapping = NULL;
 177         ClearPageAnon(page);
 178 }
 174
 175 /*
 176  * Compute the user virtual address at which page would sit in vma,
 177  * from page->index and the vma's vm_pgoff.
 178  *
 179  * Returns that address if it lies inside [vm_start, vm_end).  Otherwise
 180  * returns -EFAULT (converted to unsigned long); that case is only
 181  * tolerated for anonymous pages and BUGs for any other page.
 182  */
 183 static inline unsigned long
 184 vma_address(struct page *page, struct vm_area_struct *vma)
 185 {
 186         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 187         unsigned long address =
 188                 vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 189
 190         if (likely(address >= vma->vm_start && address < vma->vm_end))
 191                 return address;
 192
 193         /* page should be within any vma from prio_tree_next */
 194         BUG_ON(!PageAnon(page));
 195         return -EFAULT;
 196 }
 192
 193 /*
 194  * Subfunctions of page_referenced: page_referenced_one called
 195  * repeatedly from either page_referenced_anon or page_referenced_file.
      *
      * page_referenced_one looks up the pte at which vma would map page.
      * If that pte is present and really points at page, its young bit is
      * tested and cleared and *mapcount is decremented.
      *
      * Returns 1 if the young bit was set, 0 otherwise - including every
      * case where the pte could not be examined (mm->rss is zero, address
      * outside the vma, page_table_lock busy, page tables missing, pte not
      * present or mapping another page).
 196  */
 197 static int page_referenced_one(struct page *page,
 198         struct vm_area_struct *vma, unsigned int *mapcount)
 199 {
 200         struct mm_struct *mm = vma->vm_mm;
 201         unsigned long address;
 202         pgd_t *pgd;
 203         pmd_t *pmd;
 204         pte_t *pte;
 205         int referenced = 0;
 206
             /* mm->rss is zero - presumably nothing resident, so skip the walk */
 207         if (!mm->rss)
 208                 goto out;
 209         address = vma_address(page, vma);
             /* vma_address() signals "not inside this vma" with -EFAULT */
 210         if (address == -EFAULT)
 211                 goto out;
 212
             /* only trylock: a busy mm is reported as "not referenced" instead of waiting */
 213         if (!spin_trylock(&mm->page_table_lock))
 214                 goto out;
 215
 216         pgd = pgd_offset(mm, address);
 217         if (!pgd_present(*pgd))
 218                 goto out_unlock;
 219
 220         pmd = pmd_offset(pgd, address);
 221         if (!pmd_present(*pmd))
 222                 goto out_unlock;
 223
 224         pte = pte_offset_map(pmd, address);
 225         if (!pte_present(*pte))
 226                 goto out_unmap;
 227
             /* a present pte that maps some other page does not count */
 228         if (page_to_pfn(page) != pte_pfn(*pte))
 229                 goto out_unmap;
 230
 231         if (ptep_clear_flush_young(vma, address, pte))
 232                 referenced++;
 233
             /* one mapping accounted for; callers stop iterating once this hits zero */
 234         (*mapcount)--;
 235
 236 out_unmap:
 237         pte_unmap(pte);
 238 out_unlock:
 239         spin_unlock(&mm->page_table_lock);
 240 out:
 241         return referenced;
 242 }
 243
 244 /*
 245  * Referenced check for an anonymous page: page->mapping is taken as the
 246  * page's anon_vma, whose vma list is walked under the anon_vma lock,
 247  * calling page_referenced_one() on each vma.  The walk stops early once
 248  * as many mappings as page->mapcount have been found.  Returns the sum
 249  * of the page_referenced_one() results.
 250  */
 251 static inline int page_referenced_anon(struct page *page)
 252 {
 253         struct anon_vma *av = (struct anon_vma *) page->mapping;
 254         unsigned int remaining = page->mapcount;
 255         struct vm_area_struct *vma;
 256         int hits = 0;
 257
 258         spin_lock(&av->lock);
 259         BUG_ON(list_empty(&av->head));
 260         list_for_each_entry(vma, &av->head, anon_vma_node) {
 261                 hits += page_referenced_one(page, vma, &remaining);
 262                 if (remaining == 0)
 263                         break;
 264         }
 265         spin_unlock(&av->lock);
 266
 267         return hits;
 268 }
 261
 262 /**
 263  * page_referenced_file - referenced check for object-based rmap
 264  * @page: the page we're checking references on.
 265  *
 266  * Walks every vma in page->mapping's i_mmap prio tree that covers the
 267  * page's file offset and checks/clears the referenced bit in each via
 268  * page_referenced_one().  A vma with both VM_LOCKED and VM_MAYSHARE set
 269  * counts as one reference and ends the walk at once; the walk also ends
 270  * when as many mappings as page->mapcount have been found.
 271  *
 272  * Only called from page_referenced, for object-based pages.
 273  *
 274  * The i_mmap_lock is only tried.  If it cannot be taken, 0 is returned,
 275  * so try_to_unmap will then have a go.
 276  */
 277 static inline int page_referenced_file(struct page *page)
 278 {
 279         const unsigned long pinned = VM_LOCKED|VM_MAYSHARE;
 280         struct address_space *mapping = page->mapping;
 281         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 282         unsigned int remaining = page->mapcount;
 283         struct vm_area_struct *vma = NULL;
 284         struct prio_tree_iter iter;
 285         int hits = 0;
 286
 287         if (!spin_trylock(&mapping->i_mmap_lock))
 288                 return 0;
 289
 290         for (;;) {
 291                 vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 292                                          &iter, pgoff, pgoff);
 293                 if (vma == NULL)
 294                         break;
 295
 296                 if ((vma->vm_flags & pinned) == pinned) {
 297                         hits++;
 298                         break;
 299                 }
 300
 301                 hits += page_referenced_one(page, vma, &remaining);
 302                 if (remaining == 0)
 303                         break;
 304         }
 305
 306         spin_unlock(&mapping->i_mmap_lock);
 307         return hits;
 308 }
 303
 304 /**
 305  * page_referenced - test if the page was referenced
 306  * @page: the page to test
 307  *
 308  * Quick test-and-clear of the referenced state over all mappings of a
 309  * page.  The result is the sum of: one for page_test_and_clear_young(),
 310  * one for the PageReferenced flag, and - if the page is mapped and has a
 311  * mapping - the count from the anon or file walk.
 312  * Caller needs to hold the rmap lock.
 313  */
 314 int page_referenced(struct page *page)
 315 {
 316         int total = 0;
 317
 318         total += page_test_and_clear_young(page) ? 1 : 0;
 319         total += TestClearPageReferenced(page) ? 1 : 0;
 320
 321         /* nothing to walk unless the page is mapped and has a mapping */
 322         if (!page->mapcount || !page->mapping)
 323                 return total;
 324
 325         if (PageAnon(page))
 326                 return total + page_referenced_anon(page);
 327
 328         return total + page_referenced_file(page);
 329 }
 330
 331 /**
 332  * page_add_anon_rmap - add pte mapping to an anonymous page
 333  * @page:       the page to add the mapping to
 334  * @vma:        the vm area in which the mapping is added
 335  * @address:    the user virtual address mapped
 336  *
 337  * On the first mapping the page is marked anonymous and its index and
 338  * mapping (the vma's anon_vma) are recorded; on later mappings those
 339  * values are only verified.  Either way page->mapcount goes up by one.
 340  *
 341  * The caller needs to hold the mm->page_table_lock.
 342  */
 343 void page_add_anon_rmap(struct page *page,
 344         struct vm_area_struct *vma, unsigned long address)
 345 {
 346         struct anon_vma *anon_vma = vma->anon_vma;
 347         pgoff_t index;
 348
 349         BUG_ON(PageReserved(page));
 350         BUG_ON(!anon_vma);
 351
 352         /* page offset of address within the vma's object, in page-cache units */
 353         index = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
 354                         >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 355
 356         /*
 357          * PG_anon is only ever set or cleared inside page_map_lock.  That
 358          * avoids races between mapping and unmapping, in different
 359          * processes, of the same shared cow swapcache page, and it means
 360          * PG_anon cannot change under us while we hold the lock.
 361          * PG_anon cannot change under fork either: fork holds a reference
 362          * on the page, so the page cannot be unmapped under fork, which is
 363          * why copy_page_range may read PG_anon outside the page_map_lock.
 364          */
 365         page_map_lock(page);
 366         if (page->mapcount) {
 367                 /* already mapped: the recorded state has to match */
 368                 BUG_ON(!PageAnon(page));
 369                 BUG_ON(page->index != index);
 370                 BUG_ON(page->mapping != (struct address_space *) anon_vma);
 371         } else {
 372                 /* first mapping: record the anon state */
 373                 BUG_ON(PageAnon(page));
 374                 BUG_ON(page->mapping);
 375                 SetPageAnon(page);
 376                 page->index = index;
 377                 page->mapping = (struct address_space *) anon_vma;
 378                 inc_page_state(nr_mapped);
 379         }
 380         page->mapcount++;
 381         page_map_unlock(page);
 382 }
 379
 380 /**
 381  * page_add_file_rmap - add pte mapping to a file page
 382  * @page: the page to add the mapping to
 383  *
 384  * BUGs on an anonymous page.  Pages whose pfn is not valid, and reserved
 385  * pages, are not counted at all.  For every other page the mapcount is
 386  * raised under page_map_lock, bumping nr_mapped on the first mapping.
 387  *
 388  * The caller needs to hold the mm->page_table_lock.
 389  */
 390 void page_add_file_rmap(struct page *page)
 391 {
 392         BUG_ON(PageAnon(page));
 393
 394         if (pfn_valid(page_to_pfn(page)) && !PageReserved(page)) {
 395                 page_map_lock(page);
 396                 if (page->mapcount == 0)
 397                         inc_page_state(nr_mapped);
 398                 page->mapcount++;
 399                 page_map_unlock(page);
 400         }
 401 }
 398
 399 /**
 400  * page_remove_rmap - take down pte mapping from a page
 401  * @page: page to remove mapping from
 402  *
 403  * Drops page->mapcount by one under page_map_lock.  When it reaches
 404  * zero, a dirty state reported by page_test_and_clear_dirty() is passed
 405  * to set_page_dirty(), an anonymous page loses its anon state, and
 406  * nr_mapped is decremented.
 407  *
 408  * Caller needs to hold the mm->page_table_lock.
 409  */
 410 void page_remove_rmap(struct page *page)
 411 {
 412         BUG_ON(PageReserved(page));
 413         BUG_ON(page->mapcount == 0);
 414
 415         page_map_lock(page);
 416         page->mapcount -= 1;
 417         if (page->mapcount != 0)
 418                 goto unlock;
 419
 420         /* that was the last mapping of the page */
 421         if (page_test_and_clear_dirty(page))
 422                 set_page_dirty(page);
 423         if (PageAnon(page))
 424                 clear_page_anon(page);
 425         dec_page_state(nr_mapped);
 426 unlock:
 427         page_map_unlock(page);
 428 }
 421
 422 /*
 423  * Subfunctions of try_to_unmap: try_to_unmap_one called
 424  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
      *
      * try_to_unmap_one removes the pte through which vma maps page, if
      * there is one.  On success the pte is cleared (or replaced by a swap
      * entry for an anonymous page), the pte dirty bit is moved to the
      * page, and page->mapcount and the page reference are dropped.
      *
      * Returns SWAP_FAIL if the vma is VM_LOCKED or VM_RESERVED, if the pte
      * was young, or if a swapcache page has an unexpected page count.
      * Returns SWAP_AGAIN in every other case - both after a successful
      * unmap and when nothing could be done (no mapping found here, or
      * page_table_lock was busy).
 425  */
 426 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 427 {
 428         struct mm_struct *mm = vma->vm_mm;
 429         unsigned long address;
 430         pgd_t *pgd;
 431         pmd_t *pmd;
 432         pte_t *pte;
 433         pte_t pteval;
 434         int ret = SWAP_AGAIN;
 435
             /* mm->rss is zero - presumably nothing resident, so skip the walk */
 436         if (!mm->rss)
 437                 goto out;
 438         address = vma_address(page, vma);
             /* vma_address() signals "not inside this vma" with -EFAULT */
 439         if (address == -EFAULT)
 440                 goto out;
 441
 442         /*
 443          * We need the page_table_lock to protect us from page faults,
 444          * munmap, fork, etc...
 445          */
 446         if (!spin_trylock(&mm->page_table_lock))
 447                 goto out;
 448
 449         pgd = pgd_offset(mm, address);
 450         if (!pgd_present(*pgd))
 451                 goto out_unlock;
 452
 453         pmd = pmd_offset(pgd, address);
 454         if (!pmd_present(*pmd))
 455                 goto out_unlock;
 456
 457         pte = pte_offset_map(pmd, address);
 458         if (!pte_present(*pte))
 459                 goto out_unmap;
 460
             /* a present pte that maps some other page is not ours to remove */
 461         if (page_to_pfn(page) != pte_pfn(*pte))
 462                 goto out_unmap;
 463
 464         /*
 465          * If the page is mlock()d, we cannot swap it out.
 466          * If it's recently referenced (perhaps page_referenced
 467          * skipped over this mm) then we should reactivate it.
 468          */
 469         if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 470                         ptep_clear_flush_young(vma, address, pte)) {
 471                 ret = SWAP_FAIL;
 472                 goto out_unmap;
 473         }
 474
 475         /*
 476          * Don't pull an anonymous page out from under get_user_pages.
 477          * GUP carefully breaks COW and raises page count (while holding
 478          * page_table_lock, as we have here) to make sure that the page
 479          * cannot be freed.  If we unmap that page here, a user write
 480          * access to the virtual address will bring back the page, but
 481          * its raised count will (ironically) be taken to mean it's not
 482          * an exclusive swap page, do_wp_page will replace it by a copy
 483          * page, and the user never get to see the data GUP was holding
 484          * the original page for.
 485          */
 486         if (PageSwapCache(page) &&
 487             page_count(page) != page->mapcount + 2) {
 488                 ret = SWAP_FAIL;
 489                 goto out_unmap;
 490         }
 491
 492         /* Nuke the page table entry. */
 493         flush_cache_page(vma, address);
 494         pteval = ptep_clear_flush(vma, address, pte);
 495
 496         /* Move the dirty bit to the physical page now the pte is gone. */
 497         if (pte_dirty(pteval))
 498                 set_page_dirty(page);
 499
 500         if (PageAnon(page)) {
                     /* page->private is used as the swap entry value here */
 501                 swp_entry_t entry = { .val = page->private };
 502                 /*
 503                  * Store the swap location in the pte.
 504                  * See handle_pte_fault() ...
 505                  */
 506                 BUG_ON(!PageSwapCache(page));
 507                 swap_duplicate(entry);
 508                 set_pte(pte, swp_entry_to_pte(entry));
 509                 BUG_ON(pte_file(*pte));
 510         }
 511
 512         // the plain mm->rss-- was replaced by vx_rsspages_dec() (from
             // linux/vs_memory.h, presumably also per-context accounting - confirm)
 513         vx_rsspages_dec(mm);
 514         BUG_ON(!page->mapcount);
 515         page->mapcount--;
 516         page_cache_release(page);
 517
 518 out_unmap:
 519         pte_unmap(pte);
 520 out_unlock:
 521         spin_unlock(&mm->page_table_lock);
 522 out:
 523         return ret;
 524 }
 525
 526 /*
 527  * objrmap doesn't work for nonlinear VMAs because the assumption that
 528  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 529  * Consequently, given a particular page and its ->index, we cannot locate the
 530  * ptes which are mapping that page without an exhaustive linear search.
 531  *
 532  * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 533  * maps the file to which the target page belongs.  The ->vm_private_data field
 534  * holds the current cursor into that scan.  Successive searches will circulate
 535  * around the vma's virtual address space.
 536  *
 537  * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 538  * more scanning pressure is placed against them as well.   Eventually pages
 539  * will become fully unmapped and are eligible for eviction.
 540  *
 541  * For very sparsely populated VMAs this is a little inefficient - chances are
 542  * that there won't be many ptes located within the scan cluster.  In this case
 543  * maybe we could scan further - to the end of the pte page, perhaps.
 544  */
 545 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)    /* bytes per scan cluster: 32 pages, capped at PMD_SIZE */
 546 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))          /* masks an address down to a cluster boundary */
 547
 548 /*
 549  * try_to_unmap_cluster - age and unmap one cluster of a nonlinear vma
 550  * @cursor:   byte offset into the vma at which to scan
 551  * @mapcount: decremented once for every pte that gets unmapped
 552  * @vma:      the nonlinear vma being scanned
 553  *
 554  * Scans the CLUSTER_SIZE-aligned window containing vm_start + cursor,
 555  * clipped to the vma.  Present ptes that point at valid, non-reserved
 556  * pages and whose young bit was clear are removed; a pte whose young
 557  * bit was set only has it cleared.  When a removed page does not sit at
 558  * its linear position, its file offset is stored in the pte.
 559  *
 560  * Returns SWAP_FAIL if page_table_lock could not be taken, SWAP_AGAIN
 561  * otherwise.
 562  */
 563 static int try_to_unmap_cluster(unsigned long cursor,
 564         unsigned int *mapcount, struct vm_area_struct *vma)
 565 {
 566         struct mm_struct *mm = vma->vm_mm;
 567         pgd_t *pgd;
 568         pmd_t *pmd;
 569         pte_t *pte;
 570         pte_t *mapped_pte;
 571         pte_t pteval;
 572         struct page *page;
 573         unsigned long address;
 574         unsigned long end;
 575         unsigned long pfn;
 576
 577         /*
 578          * We need the page_table_lock to protect us from page faults,
 579          * munmap, fork, etc...
 580          */
 581         if (!spin_trylock(&mm->page_table_lock))
 582                 return SWAP_FAIL;
 583
 584         /* window = the aligned cluster around the cursor, clipped to the vma */
 585         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 586         end = address + CLUSTER_SIZE;
 587         if (address < vma->vm_start)
 588                 address = vma->vm_start;
 589         if (end > vma->vm_end)
 590                 end = vma->vm_end;
 591
 592         pgd = pgd_offset(mm, address);
 593         if (!pgd_present(*pgd))
 594                 goto out_unlock;
 595
 596         pmd = pmd_offset(pgd, address);
 597         if (!pmd_present(*pmd))
 598                 goto out_unlock;
 599
 600         /*
 601          * Remember the pointer pte_offset_map() returned: the loop
 602          * advances pte, possibly to one entry past the end of the mapped
 603          * pte page, and pte_unmap() must be given an address inside the
 604          * page that was actually mapped.
 605          */
 606         mapped_pte = pte_offset_map(pmd, address);
 607         for (pte = mapped_pte; address < end; pte++, address += PAGE_SIZE) {
 608
 609                 if (!pte_present(*pte))
 610                         continue;
 611
 612                 pfn = pte_pfn(*pte);
 613                 if (!pfn_valid(pfn))
 614                         continue;
 615
 616                 page = pfn_to_page(pfn);
 617                 BUG_ON(PageAnon(page));
 618                 if (PageReserved(page))
 619                         continue;
 620
 621                 /* recently referenced: only clear the young bit this time */
 622                 if (ptep_clear_flush_young(vma, address, pte))
 623                         continue;
 624
 625                 /* Nuke the page table entry. */
 626                 flush_cache_page(vma, address);
 627                 pteval = ptep_clear_flush(vma, address, pte);
 628
 629                 /* If nonlinear, store the file page offset in the pte. */
 630                 if (page->index != linear_page_index(vma, address))
 631                         set_pte(pte, pgoff_to_pte(page->index));
 632
 633                 /* Move the dirty bit to the physical page now the pte is gone. */
 634                 if (pte_dirty(pteval))
 635                         set_page_dirty(page);
 636
 637                 page_remove_rmap(page);
 638                 page_cache_release(page);
 639                 /*
 640                  * Same rss accounting helper as try_to_unmap_one(), which
 641                  * uses it in place of a bare mm->rss-- (presumably it also
 642                  * maintains per-context counters - confirm against
 643                  * linux/vs_memory.h).
 644                  */
 645                 vx_rsspages_dec(mm);
 646                 (*mapcount)--;
 647         }
 648
 649         pte_unmap(mapped_pte);
 650
 651 out_unlock:
 652         spin_unlock(&mm->page_table_lock);
 653         return SWAP_AGAIN;
 654 }
 625
 626 /*
 627  * Unmap an anonymous page: page->mapping is taken as the page's
 628  * anon_vma, whose vma list is walked under the anon_vma lock, calling
 629  * try_to_unmap_one() on each vma.  The walk stops on SWAP_FAIL or once
 630  * page->mapcount has dropped to zero.  Returns the result of the last
 631  * try_to_unmap_one() call.
 632  */
 633 static inline int try_to_unmap_anon(struct page *page)
 634 {
 635         struct anon_vma *av = (struct anon_vma *) page->mapping;
 636         struct vm_area_struct *vma;
 637         int status = SWAP_AGAIN;
 638
 639         spin_lock(&av->lock);
 640         BUG_ON(list_empty(&av->head));
 641         list_for_each_entry(vma, &av->head, anon_vma_node) {
 642                 status = try_to_unmap_one(page, vma);
 643                 if (status == SWAP_FAIL)
 644                         break;
 645                 if (page->mapcount == 0)
 646                         break;
 647         }
 648         spin_unlock(&av->lock);
 649
 650         return status;
 651 }
 642
 643 /**
 644  * try_to_unmap_file - unmap file page using the object-based rmap method
 645  * @page: the page to unmap
 646  *
 647  * Find all the mappings of a page using the mapping pointer and the vma chains
 648  * contained in the address_space struct it points to.
 649  *
 650  * This function is only called from try_to_unmap for object-based pages.
 651  *
 652  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 653  * return a temporary error.
      *
      * Note that page_map_unlock(page) is called before the nonlinear scan
      * and page_map_lock(page) again at the relock label, so the page's map
      * lock is not held continuously across a call that reaches that scan.
 654  */
 655 static inline int try_to_unmap_file(struct page *page)
 656 {
 657         struct address_space *mapping = page->mapping;
 658         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 659         struct vm_area_struct *vma = NULL;
 660         struct prio_tree_iter iter;
 661         int ret = SWAP_AGAIN;
 662         unsigned long cursor;
 663         unsigned long max_nl_cursor = 0;
 664         unsigned long max_nl_size = 0;
 665         unsigned int mapcount;
 666
 667         if (!spin_trylock(&mapping->i_mmap_lock))
 668                 return ret;
 669
             /* pass 1: the vmas in the i_mmap prio tree that cover this file offset */
 670         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 671                                         &iter, pgoff, pgoff)) != NULL) {
 672                 ret = try_to_unmap_one(page, vma);
 673                 if (ret == SWAP_FAIL || !page->mapcount)
 674                         goto out;
 675         }
 676
 677         if (list_empty(&mapping->i_mmap_nonlinear))
 678                 goto out;
 679
             /*
              * Survey the nonlinear vmas that are neither locked nor reserved:
              * largest saved cursor (kept in vm_private_data) and largest size.
              */
 680         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 681                                                 shared.vm_set.list) {
 682                 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 683                         continue;
 684                 cursor = (unsigned long) vma->vm_private_data;
 685                 if (cursor > max_nl_cursor)
 686                         max_nl_cursor = cursor;
                     /* cursor is reused here as a scratch variable for the vma size */
 687                 cursor = vma->vm_end - vma->vm_start;
 688                 if (cursor > max_nl_size)
 689                         max_nl_size = cursor;
 690         }
 691
 692         if (max_nl_size == 0)   /* any nonlinears locked or reserved */
 693                 goto out;
 694
 695         /*
 696          * We don't try to search for this page in the nonlinear vmas,
 697          * and page_referenced wouldn't have found it anyway.  Instead
 698          * just walk the nonlinear vmas trying to age and unmap some.
 699          * The mapcount of the page we came in with is irrelevant,
 700          * but even so use it as a guide to how hard we should try?
 701          */
 702         mapcount = page->mapcount;
             /* the page's map lock is dropped for the scan and retaken at relock: */
 703         page_map_unlock(page);
 704         cond_resched_lock(&mapping->i_mmap_lock);
 705
             /* round the scan limit up to a whole number of clusters */
 706         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 707         if (max_nl_cursor == 0)
 708                 max_nl_cursor = CLUSTER_SIZE;
 709
             /*
              * Each round lets every eligible vma scan clusters up to
              * max_nl_cursor, then the limit grows by one cluster, until it
              * has passed max_nl_size.
              */
 710         do {
 711                 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 712                                                 shared.vm_set.list) {
 713                         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 714                                 continue;
 715                         cursor = (unsigned long) vma->vm_private_data;
 716                         while (vma->vm_mm->rss &&
 717                                 cursor < max_nl_cursor &&
 718                                 cursor < vma->vm_end - vma->vm_start) {
 719                                 ret = try_to_unmap_cluster(
 720                                                 cursor, &mapcount, vma);
 721                                 if (ret == SWAP_FAIL)
 722                                         break;
 723                                 cursor += CLUSTER_SIZE;
 724                                 vma->vm_private_data = (void *) cursor;
                                     /*
                                      * mapcount is unsigned and the cluster scan
                                      * decrements it once per unmapped pte, so it
                                      * can wrap below zero; the signed cast
                                      * catches that as well as exact zero.
                                      */
 725                                 if ((int)mapcount <= 0)
 726                                         goto relock;
 727                         }
                             /* unless the scan failed, mark this vma as caught up to the limit */
 728                         if (ret != SWAP_FAIL)
 729                                 vma->vm_private_data =
 730                                         (void *) max_nl_cursor;
                             /* a SWAP_FAIL from the cluster scan is not passed on to the caller */
 731                         ret = SWAP_AGAIN;
 732                 }
 733                 cond_resched_lock(&mapping->i_mmap_lock);
 734                 max_nl_cursor += CLUSTER_SIZE;
 735         } while (max_nl_cursor <= max_nl_size);
 736
 737         /*
 738          * Don't loop forever (perhaps all the remaining pages are
 739          * in locked vmas).  Reset cursor on all unreserved nonlinear
 740          * vmas, now forgetting on which ones it had fallen behind.
 741          */
 742         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 743                                                 shared.vm_set.list) {
 744                 if (!(vma->vm_flags & VM_RESERVED))
 745                         vma->vm_private_data = NULL;
 746         }
 747 relock:
 748         page_map_lock(page);
 749 out:
 750         spin_unlock(&mapping->i_mmap_lock);
 751         return ret;
 752 }
 753
 754 /**
 755  * try_to_unmap - try to remove all page table mappings to a page
 756  * @page: the page to get unmapped
 757  *
 758  * Used in the pageout path to remove every page table entry that maps
 759  * this page.  Caller must hold the page lock and its rmap lock.
 760  * BUGs on a reserved, unlocked or unmapped page.  Return values are:
 761  *
 762  * SWAP_SUCCESS - we succeeded in removing all mappings
 763  * SWAP_AGAIN   - we missed a trylock, try again later
 764  * SWAP_FAIL    - the page is unswappable
 765  */
 766 int try_to_unmap(struct page *page)
 767 {
 768         int ret;
 769
 770         BUG_ON(PageReserved(page));
 771         BUG_ON(!PageLocked(page));
 772         BUG_ON(!page->mapcount);
 773
 774         ret = PageAnon(page) ? try_to_unmap_anon(page)
 775                              : try_to_unmap_file(page);
 776
 777         /* still mapped somewhere: report what the walk returned */
 778         if (page->mapcount)
 779                 return ret;
 780
 781         /* last mapping gone: same teardown as in page_remove_rmap() */
 782         if (page_test_and_clear_dirty(page))
 783                 set_page_dirty(page);
 784         if (PageAnon(page))
 785                 clear_page_anon(page);
 786         dec_page_state(nr_mapped);
 787
 788         return SWAP_SUCCESS;
 789 }