2 * mm/rmap.c - physical to virtual reverse mappings
4 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5 * Released under the General Public License (GPL).
7 * Simple, low overhead reverse mapping scheme.
8 * Please try to keep this thing as modular as possible.
10 * Provides methods for unmapping each kind of mapped page:
11 * the anon methods track anonymous pages, and
12 * the file methods track pages belonging to an inode.
14 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
22 * - the page->mapcount field is protected by the PG_maplock bit,
23 * which nests within the mm->page_table_lock,
24 * which nests within the page lock.
25 * - because swapout locking is opposite to the locking order
26 * in the page fault path, the swapout path uses trylocks
27 * on the mm->page_table_lock
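/*
 * Editor's sketch (not part of the original file): the lock ordering
 * described above, written out from outermost to innermost:
 *
 *	lock_page(page);			<- outermost: the page lock
 *	spin_lock(&mm->page_table_lock);	<- middle
 *	page_map_lock(page);			<- innermost: sets PG_maplock
 *
 * The swapout path enters with the page lock and PG_maplock already held,
 * so taking page_table_lock there would invert this order; hence the
 * spin_trylock(&mm->page_table_lock) and SWAP_AGAIN fallback in
 * try_to_unmap_one() and try_to_unmap_cluster() below.
 */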
30 #include <linux/pagemap.h>
31 #include <linux/swap.h>
32 #include <linux/swapops.h>
33 #include <linux/slab.h>
34 #include <linux/init.h>
35 #include <linux/rmap.h>
36 #include <linux/vs_memory.h>
38 #include <asm/tlbflush.h>
40 //#define RMAP_DEBUG /* can be enabled only for debugging */
42 kmem_cache_t *anon_vma_cachep;
44 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
47 struct anon_vma *anon_vma = find_vma->anon_vma;
48 struct vm_area_struct *vma;
49 unsigned int mapcount = 0;
52 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
54 BUG_ON(mapcount > 100000);
62 /* This must be called under the mmap_sem. */
63 int anon_vma_prepare(struct vm_area_struct *vma)
65 struct anon_vma *anon_vma = vma->anon_vma;
68 if (unlikely(!anon_vma)) {
69 struct mm_struct *mm = vma->vm_mm;
70 struct anon_vma *allocated = NULL;
72 anon_vma = find_mergeable_anon_vma(vma);
74 anon_vma = anon_vma_alloc();
75 if (unlikely(!anon_vma))
80 /* page_table_lock to protect against threads */
81 spin_lock(&mm->page_table_lock);
82 if (likely(!vma->anon_vma)) {
83 vma->anon_vma = anon_vma;
84 list_add(&vma->anon_vma_node, &anon_vma->head);
87 spin_unlock(&mm->page_table_lock);
88 if (unlikely(allocated))
89 anon_vma_free(allocated);
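/*
 * Illustrative sketch (editor's addition, not part of this file): the
 * expected caller ordering for an anonymous fault, loosely modelled on
 * do_anonymous_page(). anon_vma_prepare() runs first under the mmap_sem,
 * then the pte is installed and page_add_anon_rmap() called under the
 * page_table_lock. Real fault-path details (pte re-checks, lru handling,
 * accounting) are omitted; "anon_fault_sketch" is a hypothetical name.
 */
static int anon_fault_sketch(struct vm_area_struct *vma, unsigned long addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;

	if (anon_vma_prepare(vma))		/* ensure vma->anon_vma exists */
		return VM_FAULT_OOM;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return VM_FAULT_OOM;

	spin_lock(&mm->page_table_lock);
	/* ... locate the pte for addr and set_pte() the new page here ... */
	page_add_anon_rmap(page, vma, addr);	/* hook the page into vma->anon_vma */
	spin_unlock(&mm->page_table_lock);
	return VM_FAULT_MINOR;
}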
94 void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
97 BUG_ON(!next->anon_vma);
98 vma->anon_vma = next->anon_vma;
99 list_add(&vma->anon_vma_node, &next->anon_vma_node);
101 /* if they're both non-null they must be the same */
102 BUG_ON(vma->anon_vma != next->anon_vma);
104 list_del(&next->anon_vma_node);
107 void __anon_vma_link(struct vm_area_struct *vma)
109 struct anon_vma *anon_vma = vma->anon_vma;
112 list_add(&vma->anon_vma_node, &anon_vma->head);
113 validate_anon_vma(vma);
117 void anon_vma_link(struct vm_area_struct *vma)
119 struct anon_vma *anon_vma = vma->anon_vma;
122 spin_lock(&anon_vma->lock);
123 list_add(&vma->anon_vma_node, &anon_vma->head);
124 validate_anon_vma(vma);
125 spin_unlock(&anon_vma->lock);
129 void anon_vma_unlink(struct vm_area_struct *vma)
131 struct anon_vma *anon_vma = vma->anon_vma;
137 spin_lock(&anon_vma->lock);
138 validate_anon_vma(vma);
139 list_del(&vma->anon_vma_node);
141 /* We must garbage collect the anon_vma if it's empty */
142 empty = list_empty(&anon_vma->head);
143 spin_unlock(&anon_vma->lock);
146 anon_vma_free(anon_vma);
149 static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
151 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
152 SLAB_CTOR_CONSTRUCTOR) {
153 struct anon_vma *anon_vma = data;
155 spin_lock_init(&anon_vma->lock);
156 INIT_LIST_HEAD(&anon_vma->head);
160 void __init anon_vma_init(void)
162 anon_vma_cachep = kmem_cache_create("anon_vma",
163 sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
166 /* This must be called with the page->flags PG_maplock bit held. */
167 static inline void clear_page_anon(struct page *page)
169 BUG_ON(!page->mapping);
170 page->mapping = NULL;
175 * At what user virtual address is page expected in vma?
177 static inline unsigned long
178 vma_address(struct page *page, struct vm_area_struct *vma)
180 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
181 unsigned long address;
183 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
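	/*
	 * Worked example (editor's addition): assuming PAGE_CACHE_SIZE ==
	 * PAGE_SIZE == 4K, a vma with vm_start = 0x60000000 and
	 * vm_pgoff = 0x10 maps a page with page->index = 0x13 at
	 *	0x60000000 + ((0x13 - 0x10) << 12) = 0x60003000.
	 * If the result falls outside [vm_start, vm_end) (checked below),
	 * this vma does not map the page - possible for file pages found
	 * via the prio_tree - and -EFAULT is returned instead.
	 */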
184 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
185 /* page should be within any vma from prio_tree_next */
186 BUG_ON(!PageAnon(page));
193 * Subfunctions of page_referenced: page_referenced_one called
194 * repeatedly from either page_referenced_anon or page_referenced_file.
196 static int page_referenced_one(struct page *page,
197 struct vm_area_struct *vma, unsigned int *mapcount)
199 struct mm_struct *mm = vma->vm_mm;
200 unsigned long address;
208 address = vma_address(page, vma);
209 if (address == -EFAULT)
212 if (!spin_trylock(&mm->page_table_lock))
215 pgd = pgd_offset(mm, address);
216 if (!pgd_present(*pgd))
219 pmd = pmd_offset(pgd, address);
220 if (!pmd_present(*pmd))
223 pte = pte_offset_map(pmd, address);
224 if (!pte_present(*pte))
227 if (page_to_pfn(page) != pte_pfn(*pte))
230 if (ptep_clear_flush_young(vma, address, pte))
238 spin_unlock(&mm->page_table_lock);
243 static inline int page_referenced_anon(struct page *page)
245 unsigned int mapcount = page->mapcount;
246 struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
247 struct vm_area_struct *vma;
250 spin_lock(&anon_vma->lock);
251 BUG_ON(list_empty(&anon_vma->head));
252 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
253 referenced += page_referenced_one(page, vma, &mapcount);
257 spin_unlock(&anon_vma->lock);
262 * page_referenced_file - referenced check for object-based rmap
263 * @page: the page we're checking references on.
265 * For an object-based mapped page, find all the places it is mapped and
266 * check/clear the referenced flag. This is done by following the page->mapping
267 * pointer, then walking the chain of vmas it holds. It returns the number
268 * of references it found.
270 * This function is only called from page_referenced for object-based pages.
272 * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
273 * assume a reference count of 0, so try_to_unmap will then have a go.
275 static inline int page_referenced_file(struct page *page)
277 unsigned int mapcount = page->mapcount;
278 struct address_space *mapping = page->mapping;
279 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
280 struct vm_area_struct *vma = NULL;
281 struct prio_tree_iter iter;
284 if (!spin_trylock(&mapping->i_mmap_lock))
287 while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
288 &iter, pgoff, pgoff)) != NULL) {
289 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
290 == (VM_LOCKED|VM_MAYSHARE)) {
294 referenced += page_referenced_one(page, vma, &mapcount);
299 spin_unlock(&mapping->i_mmap_lock);
304 * page_referenced - test if the page was referenced
305 * @page: the page to test
307 * Quick test_and_clear_referenced for all mappings to a page,
308 * returns the number of ptes which referenced the page.
309 * Caller needs to hold the rmap lock.
311 int page_referenced(struct page *page)
315 if (page_test_and_clear_young(page))
318 if (TestClearPageReferenced(page))
321 if (page->mapcount && page->mapping) {
323 referenced += page_referenced_anon(page);
325 referenced += page_referenced_file(page);
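/*
 * Usage sketch (editor's addition): how reclaim code such as shrink_list()
 * might consume page_referenced(), holding the rmap lock (PG_maplock) as
 * required above. "page_was_referenced" is a hypothetical helper name.
 */
static int page_was_referenced(struct page *page)
{
	int referenced;

	page_map_lock(page);
	referenced = page_referenced(page);
	page_map_unlock(page);
	return referenced != 0;	/* nonzero: keep the page active for now */
}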
331 * page_add_anon_rmap - add pte mapping to an anonymous page
332 * @page: the page to add the mapping to
333 * @vma: the vm area in which the mapping is added
334 * @address: the user virtual address mapped
336 * The caller needs to hold the mm->page_table_lock.
338 void page_add_anon_rmap(struct page *page,
339 struct vm_area_struct *vma, unsigned long address)
341 struct anon_vma *anon_vma = vma->anon_vma;
344 BUG_ON(PageReserved(page));
347 index = (address - vma->vm_start) >> PAGE_SHIFT;
348 index += vma->vm_pgoff;
349 index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
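	/*
	 * Editor's note: the computation above is the inverse of
	 * vma_address(): it derives the page's offset within the anon_vma's
	 * "virtual file", so any vma on the anon_vma list can later locate
	 * this page with the same offset arithmetic (the BUG_ON below checks
	 * that an already-anon page agrees with it).
	 */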
352 * Setting and clearing PG_anon must always happen inside
353 * page_map_lock to avoid races between mapping and
354 * unmapping by different processes of the same
355 * shared COW swapcache page. While we hold the
356 * page_map_lock, PG_anon cannot change from under us.
357 * PG_anon cannot change under fork either: fork holds
358 * a reference on the page, so the page cannot be
359 * unmapped during fork, and copy_page_range is therefore
360 * allowed to read PG_anon outside the page_map_lock.
363 if (!page->mapcount) {
364 BUG_ON(PageAnon(page));
365 BUG_ON(page->mapping);
368 page->mapping = (struct address_space *) anon_vma;
369 inc_page_state(nr_mapped);
371 BUG_ON(!PageAnon(page));
372 BUG_ON(page->index != index);
373 BUG_ON(page->mapping != (struct address_space *) anon_vma);
376 page_map_unlock(page);
380 * page_add_file_rmap - add pte mapping to a file page
381 * @page: the page to add the mapping to
383 * The caller needs to hold the mm->page_table_lock.
385 void page_add_file_rmap(struct page *page)
387 BUG_ON(PageAnon(page));
388 if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
393 inc_page_state(nr_mapped);
395 page_map_unlock(page);
399 * page_remove_rmap - take down pte mapping from a page
400 * @page: page to remove mapping from
402 * Caller needs to hold the mm->page_table_lock.
404 void page_remove_rmap(struct page *page)
406 BUG_ON(PageReserved(page));
407 BUG_ON(!page->mapcount);
411 if (!page->mapcount) {
412 if (page_test_and_clear_dirty(page))
413 set_page_dirty(page);
415 clear_page_anon(page);
416 dec_page_state(nr_mapped);
418 page_map_unlock(page);
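/*
 * Editor's note: page_test_and_clear_dirty() used above (and again in
 * try_to_unmap() below) is a no-op on most architectures; on s390 it
 * queries the per-page storage-key dirty bit, so dirtiness recorded by
 * the hardware rather than in a pte still reaches set_page_dirty() once
 * the last mapping goes away.
 */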
422 * Subfunctions of try_to_unmap: try_to_unmap_one called
423 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
425 static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
427 struct mm_struct *mm = vma->vm_mm;
428 unsigned long address;
433 int ret = SWAP_AGAIN;
437 address = vma_address(page, vma);
438 if (address == -EFAULT)
442 * We need the page_table_lock to protect us from page faults,
443 * munmap, fork, etc...
445 if (!spin_trylock(&mm->page_table_lock))
448 pgd = pgd_offset(mm, address);
449 if (!pgd_present(*pgd))
452 pmd = pmd_offset(pgd, address);
453 if (!pmd_present(*pmd))
456 pte = pte_offset_map(pmd, address);
457 if (!pte_present(*pte))
460 if (page_to_pfn(page) != pte_pfn(*pte))
464 * If the page is mlock()d, we cannot swap it out.
465 * If it's recently referenced (perhaps page_referenced
466 * skipped over this mm) then we should reactivate it.
468 if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
469 ptep_clear_flush_young(vma, address, pte)) {
475 * Don't pull an anonymous page out from under get_user_pages.
476 * GUP carefully breaks COW and raises page count (while holding
477 * page_table_lock, as we have here) to make sure that the page
478 * cannot be freed. If we unmap that page here, a user write
479 * access to the virtual address will bring back the page, but
480 * its raised count will (ironically) be taken to mean it's not
481 * an exclusive swap page, do_wp_page will replace it by a copy
482 * page, and the user never gets to see the data GUP was holding
483 * the original page for.
485 if (PageSwapCache(page) &&
486 page_count(page) != page->mapcount + 2) {
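		/*
		 * Editor's note on this check: with no extra users, a
		 * swapcache page's count is expected to be page->mapcount
		 * (one reference per mapping pte, dropped below via
		 * page_cache_release) + 1 for the swap cache + 1 for our
		 * caller's reference; a higher count means someone like
		 * get_user_pages is holding the page, so fail the unmap.
		 */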
491 /* Nuke the page table entry. */
492 flush_cache_page(vma, address);
493 pteval = ptep_clear_flush(vma, address, pte);
495 /* Move the dirty bit to the physical page now the pte is gone. */
496 if (pte_dirty(pteval))
497 set_page_dirty(page);
499 if (PageAnon(page)) {
500 swp_entry_t entry = { .val = page->private };
502 * Store the swap location in the pte.
503 * See handle_pte_fault() ...
505 BUG_ON(!PageSwapCache(page));
506 swap_duplicate(entry);
507 set_pte(pte, swp_entry_to_pte(entry));
508 BUG_ON(pte_file(*pte));
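		/*
		 * Editor's note: the BUG_ON above guards the pte encoding:
		 * a swap entry written into the pte must never satisfy
		 * pte_file(), since pte_file() is how nonlinear file ptes
		 * (installed in try_to_unmap_cluster below) are told apart
		 * at fault time.
		 */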
513 BUG_ON(!page->mapcount);
515 page_cache_release(page);
520 spin_unlock(&mm->page_table_lock);
526 * objrmap doesn't work for nonlinear VMAs because the assumption that
527 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
528 * Consequently, given a particular page and its ->index, we cannot locate the
529 * ptes which are mapping that page without an exhaustive linear search.
531 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
532 * maps the file to which the target page belongs. The ->vm_private_data field
533 * holds the current cursor into that scan. Successive searches will circulate
534 * around the vma's virtual address space.
536 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
537 * more scanning pressure is placed against them as well. Eventually pages
538 * will become fully unmapped and become eligible for eviction.
540 * For very sparsely populated VMAs this is a little inefficient - chances are
541 * there won't be many ptes located within the scan cluster. In this case
542 * maybe we could scan further - to the end of the pte page, perhaps.
544 #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
545 #define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
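/*
 * Worked example (editor's addition): with 4K pages and PMD_SIZE of 2M or
 * more, CLUSTER_SIZE is min(32 * 4K, PMD_SIZE) = 128K, i.e. up to 32 ptes
 * examined per call, and CLUSTER_MASK rounds an address down to a 128K
 * boundary. A cursor of 0x23000 into a nonlinear vma thus makes
 * try_to_unmap_cluster() start at (vm_start + 0x23000) & CLUSTER_MASK, and
 * the caller then advances the cursor by 0x20000 for the next pass.
 */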
547 static int try_to_unmap_cluster(unsigned long cursor,
548 unsigned int *mapcount, struct vm_area_struct *vma)
550 struct mm_struct *mm = vma->vm_mm;
556 unsigned long address;
561 * We need the page_table_lock to protect us from page faults,
562 * munmap, fork, etc...
564 if (!spin_trylock(&mm->page_table_lock))
567 address = (vma->vm_start + cursor) & CLUSTER_MASK;
568 end = address + CLUSTER_SIZE;
569 if (address < vma->vm_start)
570 address = vma->vm_start;
571 if (end > vma->vm_end)
574 pgd = pgd_offset(mm, address);
575 if (!pgd_present(*pgd))
578 pmd = pmd_offset(pgd, address);
579 if (!pmd_present(*pmd))
582 for (pte = pte_offset_map(pmd, address);
583 address < end; pte++, address += PAGE_SIZE) {
585 if (!pte_present(*pte))
592 page = pfn_to_page(pfn);
593 BUG_ON(PageAnon(page));
594 if (PageReserved(page))
597 if (ptep_clear_flush_young(vma, address, pte))
600 /* Nuke the page table entry. */
601 flush_cache_page(vma, address);
602 pteval = ptep_clear_flush(vma, address, pte);
604 /* If nonlinear, store the file page offset in the pte. */
605 if (page->index != linear_page_index(vma, address))
606 set_pte(pte, pgoff_to_pte(page->index));
608 /* Move the dirty bit to the physical page now the pte is gone. */
609 if (pte_dirty(pteval))
610 set_page_dirty(page);
612 page_remove_rmap(page);
613 page_cache_release(page);
621 spin_unlock(&mm->page_table_lock);
625 static inline int try_to_unmap_anon(struct page *page)
627 struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
628 struct vm_area_struct *vma;
629 int ret = SWAP_AGAIN;
631 spin_lock(&anon_vma->lock);
632 BUG_ON(list_empty(&anon_vma->head));
633 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
634 ret = try_to_unmap_one(page, vma);
635 if (ret == SWAP_FAIL || !page->mapcount)
638 spin_unlock(&anon_vma->lock);
643 * try_to_unmap_file - unmap file page using the object-based rmap method
644 * @page: the page to unmap
646 * Find all the mappings of a page using the mapping pointer and the vma chains
647 * contained in the address_space struct it points to.
649 * This function is only called from try_to_unmap for object-based pages.
651 * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
652 * return a temporary error.
654 static inline int try_to_unmap_file(struct page *page)
656 struct address_space *mapping = page->mapping;
657 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
658 struct vm_area_struct *vma = NULL;
659 struct prio_tree_iter iter;
660 int ret = SWAP_AGAIN;
661 unsigned long cursor;
662 unsigned long max_nl_cursor = 0;
663 unsigned long max_nl_size = 0;
664 unsigned int mapcount;
666 if (!spin_trylock(&mapping->i_mmap_lock))
669 while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap,
670 &iter, pgoff, pgoff)) != NULL) {
671 ret = try_to_unmap_one(page, vma);
672 if (ret == SWAP_FAIL || !page->mapcount)
676 if (list_empty(&mapping->i_mmap_nonlinear))
679 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
680 shared.vm_set.list) {
681 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
683 cursor = (unsigned long) vma->vm_private_data;
684 if (cursor > max_nl_cursor)
685 max_nl_cursor = cursor;
686 cursor = vma->vm_end - vma->vm_start;
687 if (cursor > max_nl_size)
688 max_nl_size = cursor;
691 if (max_nl_size == 0) /* any nonlinears locked or reserved */
695 * We don't try to search for this page in the nonlinear vmas,
696 * and page_referenced wouldn't have found it anyway. Instead
697 * just walk the nonlinear vmas trying to age and unmap some.
698 * The mapcount of the page we came in with is irrelevant,
699 * but even so we use it as a guide to how hard we should try.
701 mapcount = page->mapcount;
702 page_map_unlock(page);
703 cond_resched_lock(&mapping->i_mmap_lock);
705 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
706 if (max_nl_cursor == 0)
707 max_nl_cursor = CLUSTER_SIZE;
710 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
711 shared.vm_set.list) {
712 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
714 cursor = (unsigned long) vma->vm_private_data;
715 while (vma->vm_mm->rss &&
716 cursor < max_nl_cursor &&
717 cursor < vma->vm_end - vma->vm_start) {
718 ret = try_to_unmap_cluster(
719 cursor, &mapcount, vma);
720 if (ret == SWAP_FAIL)
722 cursor += CLUSTER_SIZE;
723 vma->vm_private_data = (void *) cursor;
724 if ((int)mapcount <= 0)
727 if (ret != SWAP_FAIL)
728 vma->vm_private_data =
729 (void *) max_nl_cursor;
732 cond_resched_lock(&mapping->i_mmap_lock);
733 max_nl_cursor += CLUSTER_SIZE;
734 } while (max_nl_cursor <= max_nl_size);
737 * Don't loop forever (perhaps all the remaining pages are
738 * in locked vmas). Reset cursor on all unreserved nonlinear
739 * vmas, now forgetting which ones it had fallen behind on.
741 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
742 shared.vm_set.list) {
743 if (!(vma->vm_flags & VM_RESERVED))
744 vma->vm_private_data = NULL;
749 spin_unlock(&mapping->i_mmap_lock);
754 * try_to_unmap - try to remove all page table mappings to a page
755 * @page: the page to get unmapped
757 * Tries to remove all the page table entries which are mapping this
758 * page, used in the pageout path. Caller must hold the page lock
759 * and its rmap lock. Return values are:
761 * SWAP_SUCCESS - we succeeded in removing all mappings
762 * SWAP_AGAIN - we missed a trylock, try again later
763 * SWAP_FAIL - the page is unswappable
765 int try_to_unmap(struct page *page)
769 BUG_ON(PageReserved(page));
770 BUG_ON(!PageLocked(page));
771 BUG_ON(!page->mapcount);
774 ret = try_to_unmap_anon(page);
776 ret = try_to_unmap_file(page);
778 if (!page->mapcount) {
779 if (page_test_and_clear_dirty(page))
780 set_page_dirty(page);
782 clear_page_anon(page);
783 dec_page_state(nr_mapped);
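/*
 * Usage sketch (editor's addition): how a pageout loop, loosely modelled
 * on shrink_list(), might act on the return codes documented above. The
 * page is assumed to be locked and mapped already; writeback and freeing
 * of the fully-unmapped page are omitted. "reclaim_unmap_sketch" is a
 * hypothetical name.
 */
static int reclaim_unmap_sketch(struct page *page)
{
	int ret;

	page_map_lock(page);			/* the rmap lock */
	ret = try_to_unmap(page);
	page_map_unlock(page);

	switch (ret) {
	case SWAP_SUCCESS:
		return 1;	/* no ptes left: page can be cleaned and freed */
	case SWAP_AGAIN:
		return 0;	/* missed a trylock: leave it and retry later */
	case SWAP_FAIL:
	default:
		return -1;	/* unswappable (e.g. mlocked): reactivate it */
	}
}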