mm/rmap.c

   1 /*
   2  * mm/rmap.c - physical to virtual reverse mappings
   3  *
   4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
   5  * Released under the General Public License (GPL).
   6  *
   7  * Simple, low overhead reverse mapping scheme.
   8  * Please try to keep this thing as modular as possible.
   9  *
  10  * Provides methods for unmapping each kind of mapped page:
  11  * the anon methods track anonymous pages, and
  12  * the file methods track pages belonging to an inode.
  13  *
  14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
  15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
  16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
  17  * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
  18  */
  19
  20 /*
  21  * Locking: see "Lock ordering" summary in filemap.c.
  22  * In swapout, page_map_lock is held on entry to page_referenced and
  23  * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
  24  */
  25
  26 #include <linux/mm.h>
  27 #include <linux/pagemap.h>
  28 #include <linux/swap.h>
  29 #include <linux/swapops.h>
  30 #include <linux/slab.h>
  31 #include <linux/init.h>
  32 #include <linux/rmap.h>
  33
  34 #include <asm/tlbflush.h>
  35
  36 //#define RMAP_DEBUG /* can be enabled only for debugging */
  37
     /* Slab cache sized for struct anon_vma; created in anon_vma_init(). */
  38 kmem_cache_t *anon_vma_cachep;
  39
  40 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
  41 {
  42 #ifdef RMAP_DEBUG
  43         struct anon_vma *anon_vma = find_vma->anon_vma;
  44         struct vm_area_struct *vma;
  45         unsigned int mapcount = 0;
  46         int found = 0;
  47
  48         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
  49                 mapcount++;
  50                 BUG_ON(mapcount > 100000);
  51                 if (vma == find_vma)
  52                         found = 1;
  53         }
  54         BUG_ON(!found);
  55 #endif
  56 }
  57
  58 /* This must be called under the mmap_sem. */
  59 int anon_vma_prepare(struct vm_area_struct *vma)
  60 {
  61         struct anon_vma *anon_vma = vma->anon_vma;
  62
  63         might_sleep();
  64         if (unlikely(!anon_vma)) {
  65                 struct mm_struct *mm = vma->vm_mm;
  66                 struct anon_vma *allocated = NULL;
  67
  68                 anon_vma = find_mergeable_anon_vma(vma);
  69                 if (!anon_vma) {
  70                         anon_vma = anon_vma_alloc();
  71                         if (unlikely(!anon_vma))
  72                                 return -ENOMEM;
  73                         allocated = anon_vma;
  74                 }
  75
  76                 /* page_table_lock to protect against threads */
  77                 spin_lock(&mm->page_table_lock);
  78                 if (likely(!vma->anon_vma)) {
  79                         if (!allocated)
  80                                 spin_lock(&anon_vma->lock);
  81                         vma->anon_vma = anon_vma;
  82                         list_add(&vma->anon_vma_node, &anon_vma->head);
  83                         if (!allocated)
  84                                 spin_unlock(&anon_vma->lock);
  85                         allocated = NULL;
  86                 }
  87                 spin_unlock(&mm->page_table_lock);
  88                 if (unlikely(allocated))
  89                         anon_vma_free(allocated);
  90         }
  91         return 0;
  92 }
  93
  94 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
  95 {
  96         if (!vma->anon_vma) {
  97                 BUG_ON(!next->anon_vma);
  98                 vma->anon_vma = next->anon_vma;
  99                 list_add(&vma->anon_vma_node, &next->anon_vma_node);
 100         } else {
 101                 /* if they're both non-null they must be the same */
 102                 BUG_ON(vma->anon_vma != next->anon_vma);
 103         }
 104         list_del(&next->anon_vma_node);
 105 }
 106
 107 void __anon_vma_link(struct vm_area_struct *vma)
 108 {
 109         struct anon_vma *anon_vma = vma->anon_vma;
 110
 111         if (anon_vma) {
 112                 list_add(&vma->anon_vma_node, &anon_vma->head);
 113                 validate_anon_vma(vma);
 114         }
 115 }
 116
 117 void anon_vma_link(struct vm_area_struct *vma)
 118 {
 119         struct anon_vma *anon_vma = vma->anon_vma;
 120
 121         if (anon_vma) {
 122                 spin_lock(&anon_vma->lock);
 123                 list_add(&vma->anon_vma_node, &anon_vma->head);
 124                 validate_anon_vma(vma);
 125                 spin_unlock(&anon_vma->lock);
 126         }
 127 }
 128
 129 void anon_vma_unlink(struct vm_area_struct *vma)
 130 {
 131         struct anon_vma *anon_vma = vma->anon_vma;
 132         int empty;
 133
 134         if (!anon_vma)
 135                 return;
 136
 137         spin_lock(&anon_vma->lock);
 138         validate_anon_vma(vma);
 139         list_del(&vma->anon_vma_node);
 140
 141         /* We must garbage collect the anon_vma if it's empty */
 142         empty = list_empty(&anon_vma->head);
 143         spin_unlock(&anon_vma->lock);
 144
 145         if (empty)
 146                 anon_vma_free(anon_vma);
 147 }
 148
 149 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 150 {
 151         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 152                                                 SLAB_CTOR_CONSTRUCTOR) {
 153                 struct anon_vma *anon_vma = data;
 154
 155                 spin_lock_init(&anon_vma->lock);
 156                 INIT_LIST_HEAD(&anon_vma->head);
 157         }
 158 }
 159
 160 void __init anon_vma_init(void)
 161 {
 162         anon_vma_cachep = kmem_cache_create("anon_vma",
 163                 sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 164 }
 165
 166 /* this needs the page->flags PG_maplock held */
 167 static inline void clear_page_anon(struct page *page)
 168 {
 169         BUG_ON(!page->mapping);
 170         page->mapping = NULL;
 171         ClearPageAnon(page);
 172 }
 173
 174 /*
 175  * At what user virtual address is page expected in vma?
 176  */
 177 static inline unsigned long
 178 vma_address(struct page *page, struct vm_area_struct *vma)
 179 {
 180         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 181         unsigned long address;
 182
 183         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 184         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 185                 /* page should be within any vma from prio_tree_next */
 186                 BUG_ON(!PageAnon(page));
 187                 return -EFAULT;
 188         }
 189         return address;
 190 }
 191
 192 /*
 193  * Subfunctions of page_referenced: page_referenced_one called
 194  * repeatedly from either page_referenced_anon or page_referenced_file.
 195  */
 196 static int page_referenced_one(struct page *page,
 197         struct vm_area_struct *vma, unsigned int *mapcount)
 198 {
 199         struct mm_struct *mm = vma->vm_mm;
 200         unsigned long address;
 201         pgd_t *pgd;
 202         pmd_t *pmd;
 203         pte_t *pte;
 204         int referenced = 0;
 205
 206         if (!mm->rss)
 207                 goto out;
 208         address = vma_address(page, vma);
 209         if (address == -EFAULT)
 210                 goto out;
 211
 212         if (!spin_trylock(&mm->page_table_lock))
 213                 goto out;
 214
 215         pgd = pgd_offset(mm, address);
 216         if (!pgd_present(*pgd))
 217                 goto out_unlock;
 218
 219         pmd = pmd_offset(pgd, address);
 220         if (!pmd_present(*pmd))
 221                 goto out_unlock;
 222
 223         pte = pte_offset_map(pmd, address);
 224         if (!pte_present(*pte))
 225                 goto out_unmap;
 226
 227         if (page_to_pfn(page) != pte_pfn(*pte))
 228                 goto out_unmap;
 229
 230         if (ptep_clear_flush_young(vma, address, pte))
 231                 referenced++;
 232
 233         (*mapcount)--;
 234
 235 out_unmap:
 236         pte_unmap(pte);
 237 out_unlock:
 238         spin_unlock(&mm->page_table_lock);
 239 out:
 240         return referenced;
 241 }
 242
 243 static inline int page_referenced_anon(struct page *page)
 244 {
 245         unsigned int mapcount = page->mapcount;
 246         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 247         struct vm_area_struct *vma;
 248         int referenced = 0;
 249
 250         spin_lock(&anon_vma->lock);
 251         BUG_ON(list_empty(&anon_vma->head));
 252         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 253                 referenced += page_referenced_one(page, vma, &mapcount);
 254                 if (!mapcount)
 255                         break;
 256         }
 257         spin_unlock(&anon_vma->lock);
 258         return referenced;
 259 }
 260
 261 /**
 262  * page_referenced_file - referenced check for object-based rmap
 263  * @page: the page we're checking references on.
 264  *
 265  * For an object-based mapped page, find all the places it is mapped and
 266  * check/clear the referenced flag.  This is done by following the page->mapping
 267  * pointer, then walking the chain of vmas it holds.  It returns the number
 268  * of references it found.
 269  *
 270  * This function is only called from page_referenced for object-based pages.
 271  *
 272  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 273  * assume a reference count of 0, so try_to_unmap will then have a go.
 274  */
 275 static inline int page_referenced_file(struct page *page)
 276 {
 277         unsigned int mapcount = page->mapcount;
 278         struct address_space *mapping = page->mapping;
 279         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 280         struct vm_area_struct *vma = NULL;
 281         struct prio_tree_iter iter;
 282         int referenced = 0;
 283
 284         if (!spin_trylock(&mapping->i_mmap_lock))
 285                 return 0;
 286
 287         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 288                                         &iter, pgoff, pgoff)) != NULL) {
 289                 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 290                                   == (VM_LOCKED|VM_MAYSHARE)) {
 291                         referenced++;
 292                         break;
 293                 }
 294                 referenced += page_referenced_one(page, vma, &mapcount);
 295                 if (!mapcount)
 296                         break;
 297         }
 298
 299         spin_unlock(&mapping->i_mmap_lock);
 300         return referenced;
 301 }
 302
 303 /**
 304  * page_referenced - test if the page was referenced
 305  * @page: the page to test
 306  *
 307  * Quick test_and_clear_referenced for all mappings to a page,
 308  * returns the number of ptes which referenced the page.
 309  * Caller needs to hold the rmap lock.
 310  */
 311 int page_referenced(struct page *page)
 312 {
 313         int referenced = 0;
 314
 315         if (page_test_and_clear_young(page))
 316                 referenced++;
 317
 318         if (TestClearPageReferenced(page))
 319                 referenced++;
 320
 321         if (page->mapcount && page->mapping) {
 322                 if (PageAnon(page))
 323                         referenced += page_referenced_anon(page);
 324                 else
 325                         referenced += page_referenced_file(page);
 326         }
 327         return referenced;
 328 }
 329
 330 /**
 331  * page_add_anon_rmap - add pte mapping to an anonymous page
 332  * @page:       the page to add the mapping to
 333  * @vma:        the vm area in which the mapping is added
 334  * @address:    the user virtual address mapped
 335  *
 336  * The caller needs to hold the mm->page_table_lock.
 337  */
 338 void page_add_anon_rmap(struct page *page,
 339         struct vm_area_struct *vma, unsigned long address)
 340 {
 341         struct anon_vma *anon_vma = vma->anon_vma;
 342         pgoff_t index;
 343
 344         BUG_ON(PageReserved(page));
 345         BUG_ON(!anon_vma);
 346
 347         index = (address - vma->vm_start) >> PAGE_SHIFT;
 348         index += vma->vm_pgoff;
 349         index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 350
 351         /*
 352          * Setting and clearing PG_anon must always happen inside
 353          * page_map_lock to avoid races between mapping and
 354          * unmapping on different processes of the same
 355          * shared cow swapcache page. And while we take the
 356          * page_map_lock PG_anon cannot change from under us.
 357          * Actually PG_anon cannot change under fork either
 358          * since fork holds a reference on the page so it cannot
 359          * be unmapped under fork and in turn copy_page_range is
 360          * allowed to read PG_anon outside the page_map_lock.
 361          */
 362         page_map_lock(page);
 363         if (!page->mapcount) {
 364                 BUG_ON(PageAnon(page));
 365                 BUG_ON(page->mapping);
 366                 SetPageAnon(page);
 367                 page->index = index;
 368                 page->mapping = (struct address_space *) anon_vma;
 369                 inc_page_state(nr_mapped);
 370         } else {
 371                 BUG_ON(!PageAnon(page));
 372                 BUG_ON(page->index != index);
 373                 BUG_ON(page->mapping != (struct address_space *) anon_vma);
 374         }
 375         page->mapcount++;
 376         page_map_unlock(page);
 377 }
 378
 379 /**
 380  * page_add_file_rmap - add pte mapping to a file page
 381  * @page: the page to add the mapping to
 382  *
 383  * The caller needs to hold the mm->page_table_lock.
 384  */
 385 void page_add_file_rmap(struct page *page)
 386 {
 387         BUG_ON(PageAnon(page));
 388         if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 389                 return;
 390
 391         page_map_lock(page);
 392         if (!page->mapcount)
 393                 inc_page_state(nr_mapped);
 394         page->mapcount++;
 395         page_map_unlock(page);
 396 }
 397
 398 /**
 399  * page_remove_rmap - take down pte mapping from a page
 400  * @page: page to remove mapping from
 401  *
 402  * Caller needs to hold the mm->page_table_lock.
 403  */
 404 void page_remove_rmap(struct page *page)
 405 {
 406         BUG_ON(PageReserved(page));
 407         BUG_ON(!page->mapcount);
 408
 409         page_map_lock(page);
 410         page->mapcount--;
 411         if (!page->mapcount) {
 412                 if (page_test_and_clear_dirty(page))
 413                         set_page_dirty(page);
 414                 if (PageAnon(page))
 415                         clear_page_anon(page);
 416                 dec_page_state(nr_mapped);
 417         }
 418         page_map_unlock(page);
 419 }
 420
 421 /*
 422  * Subfunctions of try_to_unmap: try_to_unmap_one called
 423  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 424  */
 425 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 426 {
 427         struct mm_struct *mm = vma->vm_mm;
 428         unsigned long address;
 429         pgd_t *pgd;
 430         pmd_t *pmd;
 431         pte_t *pte;
 432         pte_t pteval;
 433         int ret = SWAP_AGAIN;
 434
 435         if (!mm->rss)
 436                 goto out;
 437         address = vma_address(page, vma);
 438         if (address == -EFAULT)
 439                 goto out;
 440
 441         /*
 442          * We need the page_table_lock to protect us from page faults,
 443          * munmap, fork, etc...
 444          */
 445         if (!spin_trylock(&mm->page_table_lock))
 446                 goto out;
 447
 448         pgd = pgd_offset(mm, address);
 449         if (!pgd_present(*pgd))
 450                 goto out_unlock;
 451
 452         pmd = pmd_offset(pgd, address);
 453         if (!pmd_present(*pmd))
 454                 goto out_unlock;
 455
 456         pte = pte_offset_map(pmd, address);
 457         if (!pte_present(*pte))
 458                 goto out_unmap;
 459
 460         if (page_to_pfn(page) != pte_pfn(*pte))
 461                 goto out_unmap;
 462
 463         /*
 464          * If the page is mlock()d, we cannot swap it out.
 465          * If it's recently referenced (perhaps page_referenced
 466          * skipped over this mm) then we should reactivate it.
 467          */
 468         if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 469                         ptep_clear_flush_young(vma, address, pte)) {
 470                 ret = SWAP_FAIL;
 471                 goto out_unmap;
 472         }
 473
 474         /*
 475          * Don't pull an anonymous page out from under get_user_pages.
 476          * GUP carefully breaks COW and raises page count (while holding
 477          * page_table_lock, as we have here) to make sure that the page
 478          * cannot be freed.  If we unmap that page here, a user write
 479          * access to the virtual address will bring back the page, but
 480          * its raised count will (ironically) be taken to mean it's not
 481          * an exclusive swap page, do_wp_page will replace it by a copy
 482          * page, and the user never get to see the data GUP was holding
 483          * the original page for.
 484          */
 485         if (PageSwapCache(page) &&
 486             page_count(page) != page->mapcount + 2) {
 487                 ret = SWAP_FAIL;
 488                 goto out_unmap;
 489         }
 490
 491         /* Nuke the page table entry. */
 492         flush_cache_page(vma, address);
 493         pteval = ptep_clear_flush(vma, address, pte);
 494
 495         /* Move the dirty bit to the physical page now the pte is gone. */
 496         if (pte_dirty(pteval))
 497                 set_page_dirty(page);
 498
 499         if (PageAnon(page)) {
 500                 swp_entry_t entry = { .val = page->private };
 501                 /*
 502                  * Store the swap location in the pte.
 503                  * See handle_pte_fault() ...
 504                  */
 505                 BUG_ON(!PageSwapCache(page));
 506                 swap_duplicate(entry);
 507                 set_pte(pte, swp_entry_to_pte(entry));
 508                 BUG_ON(pte_file(*pte));
 509         }
 510
 511         mm->rss--;
 512         BUG_ON(!page->mapcount);
 513         page->mapcount--;
 514         page_cache_release(page);
 515
 516 out_unmap:
 517         pte_unmap(pte);
 518 out_unlock:
 519         spin_unlock(&mm->page_table_lock);
 520 out:
 521         return ret;
 522 }
 523
 524 /*
 525  * objrmap doesn't work for nonlinear VMAs because the assumption that
 526  * offset-into-file correlates with offset-into-virtual-addresses does not hold.
 527  * Consequently, given a particular page and its ->index, we cannot locate the
 528  * ptes which are mapping that page without an exhaustive linear search.
 529  *
 530  * So what this code does is a mini "virtual scan" of each nonlinear VMA which
 531  * maps the file to which the target page belongs.  The ->vm_private_data field
 532  * holds the current cursor into that scan.  Successive searches will circulate
 533  * around the vma's virtual address space.
 534  *
 535  * So as more replacement pressure is applied to the pages in a nonlinear VMA,
 536  * more scanning pressure is placed against them as well.   Eventually pages
 537  * will become fully unmapped and are eligible for eviction.
 538  *
 539  * For very sparsely populated VMAs this is a little inefficient - chances are
 540  * there won't be many ptes located within the scan cluster.  In this case
 541  * maybe we could scan further - to the end of the pte page, perhaps.
 542  */
     /* Bytes covered by one scan window: the smaller of 32 pages and PMD_SIZE. */
 543 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
     /* AND with an address to round it down to a CLUSTER_SIZE boundary. */
 544 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 545
 546 static int try_to_unmap_cluster(unsigned long cursor,
 547         unsigned int *mapcount, struct vm_area_struct *vma)
 548 {
 549         struct mm_struct *mm = vma->vm_mm;
 550         pgd_t *pgd;
 551         pmd_t *pmd;
 552         pte_t *pte;
 553         pte_t pteval;
 554         struct page *page;
 555         unsigned long address;
 556         unsigned long end;
 557         unsigned long pfn;
 558
 559         /*
 560          * We need the page_table_lock to protect us from page faults,
 561          * munmap, fork, etc...
 562          */
 563         if (!spin_trylock(&mm->page_table_lock))
 564                 return SWAP_FAIL;
 565
 566         address = (vma->vm_start + cursor) & CLUSTER_MASK;
 567         end = address + CLUSTER_SIZE;
 568         if (address < vma->vm_start)
 569                 address = vma->vm_start;
 570         if (end > vma->vm_end)
 571                 end = vma->vm_end;
 572
 573         pgd = pgd_offset(mm, address);
 574         if (!pgd_present(*pgd))
 575                 goto out_unlock;
 576
 577         pmd = pmd_offset(pgd, address);
 578         if (!pmd_present(*pmd))
 579                 goto out_unlock;
 580
 581         for (pte = pte_offset_map(pmd, address);
 582                         address < end; pte++, address += PAGE_SIZE) {
 583
 584                 if (!pte_present(*pte))
 585                         continue;
 586
 587                 pfn = pte_pfn(*pte);
 588                 if (!pfn_valid(pfn))
 589                         continue;
 590
 591                 page = pfn_to_page(pfn);
 592                 BUG_ON(PageAnon(page));
 593                 if (PageReserved(page))
 594                         continue;
 595
 596                 if (ptep_clear_flush_young(vma, address, pte))
 597                         continue;
 598
 599                 /* Nuke the page table entry. */
 600                 flush_cache_page(vma, address);
 601                 pteval = ptep_clear_flush(vma, address, pte);
 602
 603                 /* If nonlinear, store the file page offset in the pte. */
 604                 if (page->index != linear_page_index(vma, address))
 605                         set_pte(pte, pgoff_to_pte(page->index));
 606
 607                 /* Move the dirty bit to the physical page now the pte is gone. */
 608                 if (pte_dirty(pteval))
 609                         set_page_dirty(page);
 610
 611                 page_remove_rmap(page);
 612                 page_cache_release(page);
 613                 mm->rss--;
 614                 (*mapcount)--;
 615         }
 616
 617         pte_unmap(pte);
 618
 619 out_unlock:
 620         spin_unlock(&mm->page_table_lock);
 621         return SWAP_AGAIN;
 622 }
 623
 624 static inline int try_to_unmap_anon(struct page *page)
 625 {
 626         struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
 627         struct vm_area_struct *vma;
 628         int ret = SWAP_AGAIN;
 629
 630         spin_lock(&anon_vma->lock);
 631         BUG_ON(list_empty(&anon_vma->head));
 632         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 633                 ret = try_to_unmap_one(page, vma);
 634                 if (ret == SWAP_FAIL || !page->mapcount)
 635                         break;
 636         }
 637         spin_unlock(&anon_vma->lock);
 638         return ret;
 639 }
 640
 641 /**
 642  * try_to_unmap_file - unmap file page using the object-based rmap method
 643  * @page: the page to unmap
 644  *
 645  * Find all the mappings of a page using the mapping pointer and the vma chains
 646  * contained in the address_space struct it points to.
 647  *
 648  * This function is only called from try_to_unmap for object-based pages.
 649  *
 650  * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 651  * return a temporary error.
 652  */
 653 static inline int try_to_unmap_file(struct page *page)
 654 {
 655         struct address_space *mapping = page->mapping;
 656         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 657         struct vm_area_struct *vma = NULL;
 658         struct prio_tree_iter iter;
 659         int ret = SWAP_AGAIN;
 660         unsigned long cursor;
 661         unsigned long max_nl_cursor = 0;
 662         unsigned long max_nl_size = 0;
 663         unsigned int mapcount;
 664
 665         if (!spin_trylock(&mapping->i_mmap_lock))
 666                 return ret;
 667
 668         while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
 669                                         &iter, pgoff, pgoff)) != NULL) {
 670                 ret = try_to_unmap_one(page, vma);
 671                 if (ret == SWAP_FAIL || !page->mapcount)
 672                         goto out;
 673         }
 674
 675         if (list_empty(&mapping->i_mmap_nonlinear))
 676                 goto out;
 677
 678         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 679                                                 shared.vm_set.list) {
 680                 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 681                         continue;
 682                 cursor = (unsigned long) vma->vm_private_data;
 683                 if (cursor > max_nl_cursor)
 684                         max_nl_cursor = cursor;
 685                 cursor = vma->vm_end - vma->vm_start;
 686                 if (cursor > max_nl_size)
 687                         max_nl_size = cursor;
 688         }
 689
 690         if (max_nl_size == 0)   /* any nonlinears locked or reserved */
 691                 goto out;
 692
 693         /*
 694          * We don't try to search for this page in the nonlinear vmas,
 695          * and page_referenced wouldn't have found it anyway.  Instead
 696          * just walk the nonlinear vmas trying to age and unmap some.
 697          * The mapcount of the page we came in with is irrelevant,
 698          * but even so use it as a guide to how hard we should try?
 699          */
 700         mapcount = page->mapcount;
 701         page_map_unlock(page);
 702         cond_resched_lock(&mapping->i_mmap_lock);
 703
 704         max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 705         if (max_nl_cursor == 0)
 706                 max_nl_cursor = CLUSTER_SIZE;
 707
 708         do {
 709                 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 710                                                 shared.vm_set.list) {
 711                         if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 712                                 continue;
 713                         cursor = (unsigned long) vma->vm_private_data;
 714                         while (vma->vm_mm->rss &&
 715                                 cursor < max_nl_cursor &&
 716                                 cursor < vma->vm_end - vma->vm_start) {
 717                                 ret = try_to_unmap_cluster(
 718                                                 cursor, &mapcount, vma);
 719                                 if (ret == SWAP_FAIL)
 720                                         break;
 721                                 cursor += CLUSTER_SIZE;
 722                                 vma->vm_private_data = (void *) cursor;
 723                                 if ((int)mapcount <= 0)
 724                                         goto relock;
 725                         }
 726                         if (ret != SWAP_FAIL)
 727                                 vma->vm_private_data =
 728                                         (void *) max_nl_cursor;
 729                         ret = SWAP_AGAIN;
 730                 }
 731                 cond_resched_lock(&mapping->i_mmap_lock);
 732                 max_nl_cursor += CLUSTER_SIZE;
 733         } while (max_nl_cursor <= max_nl_size);
 734
 735         /*
 736          * Don't loop forever (perhaps all the remaining pages are
 737          * in locked vmas).  Reset cursor on all unreserved nonlinear
 738          * vmas, now forgetting on which ones it had fallen behind.
 739          */
 740         list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 741                                                 shared.vm_set.list) {
 742                 if (!(vma->vm_flags & VM_RESERVED))
 743                         vma->vm_private_data = NULL;
 744         }
 745 relock:
 746         page_map_lock(page);
 747 out:
 748         spin_unlock(&mapping->i_mmap_lock);
 749         return ret;
 750 }
 751
 752 /**
 753  * try_to_unmap - try to remove all page table mappings to a page
 754  * @page: the page to get unmapped
 755  *
 756  * Tries to remove all the page table entries which are mapping this
 757  * page, used in the pageout path.  Caller must hold the page lock
 758  * and its rmap lock.  Return values are:
 759  *
 760  * SWAP_SUCCESS - we succeeded in removing all mappings
 761  * SWAP_AGAIN   - we missed a trylock, try again later
 762  * SWAP_FAIL    - the page is unswappable
 763  */
 764 int try_to_unmap(struct page *page)
 765 {
 766         int ret;
 767
 768         BUG_ON(PageReserved(page));
 769         BUG_ON(!PageLocked(page));
 770         BUG_ON(!page->mapcount);
 771
 772         if (PageAnon(page))
 773                 ret = try_to_unmap_anon(page);
 774         else
 775                 ret = try_to_unmap_file(page);
 776
 777         if (!page->mapcount) {
 778                 if (page_test_and_clear_dirty(page))
 779                         set_page_dirty(page);
 780                 if (PageAnon(page))
 781                         clear_page_anon(page);
 782                 dec_page_state(nr_mapped);
 783                 ret = SWAP_SUCCESS;
 784         }
 785         return ret;
 786 }