X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=mm%2Frmap.c;h=c5363bf4131df12ad4571dd2d033608ef7472746;hb=c7b5ebbddf7bcd3651947760f423e3783bbe6573;hp=c8e236e58e83614b2df4f5f60ffb7d814cae2acd;hpb=a2c21200f1c81b08cb55e417b68150bba439b646;p=linux-2.6.git

diff --git a/mm/rmap.c b/mm/rmap.c
index c8e236e58..c5363bf41 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -18,9 +18,30 @@
 */
 
 /*
- * Locking: see "Lock ordering" summary in filemap.c.
- * In swapout, page_map_lock is held on entry to page_referenced and
- * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
+ * Lock ordering in mm:
+ *
+ * inode->i_sem (while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem
+ *
+ * When a page fault occurs in writing from user to file, down_read
+ * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
+ * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
+ * taken together; in truncation, i_sem is taken outermost.
+ *
+ * mm->mmap_sem
+ *   page->flags PG_locked (lock_page)
+ *     mapping->i_mmap_lock
+ *       anon_vma->lock
+ *         mm->page_table_lock
+ *           zone->lru_lock (in mark_page_accessed)
+ *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             swap_device_lock (in swap_duplicate, swap_info_get)
+ *           mapping->private_lock (in __set_page_dirty_buffers)
+ *           inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *             sb_lock (within inode_lock in fs/fs-writeback.c)
+ *             mapping->tree_lock (widely used, in set_page_dirty,
+ *                       in arch-dependent flush_dcache_mmap_lock,
+ *                       within inode_lock in __sync_single_inode)
 */
 
 #include <linux/mm.h>
@@ -30,6 +51,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
+#include <linux/rcupdate.h>
 #include <linux/vs_memory.h>
 #include <asm/tlbflush.h>
 
@@ -64,28 +86,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 might_sleep();
 if (unlikely(!anon_vma)) {
 struct mm_struct *mm = vma->vm_mm;
- struct anon_vma *allocated = NULL;
+ struct anon_vma *allocated, *locked;
 
 anon_vma = find_mergeable_anon_vma(vma);
- if (!anon_vma) {
+ if (anon_vma) {
+ allocated = NULL;
+ locked = anon_vma;
+ spin_lock(&locked->lock);
+ } else {
 anon_vma = anon_vma_alloc();
 if (unlikely(!anon_vma))
 return -ENOMEM;
 allocated = anon_vma;
+ locked = NULL;
 }
 
 /* page_table_lock to protect against threads */
 spin_lock(&mm->page_table_lock);
 if (likely(!vma->anon_vma)) {
- if (!allocated)
- spin_lock(&anon_vma->lock);
 vma->anon_vma = anon_vma;
 list_add(&vma->anon_vma_node, &anon_vma->head);
- if (!allocated)
- spin_unlock(&anon_vma->lock);
 allocated = NULL;
 }
 spin_unlock(&mm->page_table_lock);
+
+ if (locked)
+ spin_unlock(&locked->lock);
 if (unlikely(allocated))
 anon_vma_free(allocated);
 }
@@ -160,16 +186,31 @@ static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
 void __init anon_vma_init(void)
 {
- anon_vma_cachep = kmem_cache_create("anon_vma",
- sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
+ anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
 }
 
-/* this needs the page->flags PG_maplock held */
-static inline void clear_page_anon(struct page *page)
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma relies on RCU to guard against the races.
+ */ +static struct anon_vma *page_lock_anon_vma(struct page *page) { - BUG_ON(!page->mapping); - page->mapping = NULL; - ClearPageAnon(page); + struct anon_vma *anon_vma = NULL; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long) page->mapping; + if (!(anon_mapping & PAGE_MAPPING_ANON)) + goto out; + if (!page_mapped(page)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); +out: + rcu_read_unlock(); + return anon_vma; } /* @@ -190,6 +231,24 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +/* + * At what user virtual address is page expected in vma? checking that the + * page matches the vma: currently only used by unuse_process, on anon pages. + */ +unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +{ + if (PageAnon(page)) { + if ((void *)vma->anon_vma != + (void *)page->mapping - PAGE_MAPPING_ANON) + return -EFAULT; + } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { + if (vma->vm_file->f_mapping != page->mapping) + return -EFAULT; + } else + return -EFAULT; + return vma_address(page, vma); +} + /* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. @@ -210,8 +269,7 @@ static int page_referenced_one(struct page *page, if (address == -EFAULT) goto out; - if (!spin_trylock(&mm->page_table_lock)) - goto out; + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -231,6 +289,9 @@ static int page_referenced_one(struct page *page, if (ptep_clear_flush_young(vma, address, pte)) referenced++; + if (mm != current->mm && has_swap_token(mm)) + referenced++; + (*mapcount)--; out_unmap: @@ -241,15 +302,18 @@ out: return referenced; } -static inline int page_referenced_anon(struct page *page) +static int page_referenced_anon(struct page *page) { - unsigned int mapcount = page->mapcount; - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; + unsigned int mapcount; + struct anon_vma *anon_vma; struct vm_area_struct *vma; int referenced = 0; - spin_lock(&anon_vma->lock); - BUG_ON(list_empty(&anon_vma->head)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return referenced; + + mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) @@ -269,24 +333,40 @@ static inline int page_referenced_anon(struct page *page) * of references it found. * * This function is only called from page_referenced for object-based pages. - * - * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten, - * assume a reference count of 0, so try_to_unmap will then have a go. */ -static inline int page_referenced_file(struct page *page) +static int page_referenced_file(struct page *page) { - unsigned int mapcount = page->mapcount; + unsigned int mapcount; struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; struct prio_tree_iter iter; int referenced = 0; - if (!spin_trylock(&mapping->i_mmap_lock)) - return 0; + /* + * The caller's checks on page->mapping and !PageAnon have made + * sure that this is a file page: the check for page->mapping + * excludes the case just before it gets set on an anon page. 
+ */ + BUG_ON(PageAnon(page)); + + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_lock. + */ + BUG_ON(!PageLocked(page)); - while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap, - &iter, pgoff, pgoff)) != NULL) { + spin_lock(&mapping->i_mmap_lock); + + /* + * i_mmap_lock does not stabilize mapcount at all, but mapcount + * is more likely to be accurate if we note it after spinning. + */ + mapcount = page_mapcount(page); + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) == (VM_LOCKED|VM_MAYSHARE)) { referenced++; @@ -304,12 +384,12 @@ static inline int page_referenced_file(struct page *page) /** * page_referenced - test if the page was referenced * @page: the page to test + * @is_locked: caller holds lock on the page * * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. - * Caller needs to hold the rmap lock. */ -int page_referenced(struct page *page) +int page_referenced(struct page *page, int is_locked) { int referenced = 0; @@ -319,11 +399,18 @@ int page_referenced(struct page *page) if (TestClearPageReferenced(page)) referenced++; - if (page->mapcount && page->mapping) { + if (page_mapped(page) && page->mapping) { if (PageAnon(page)) referenced += page_referenced_anon(page); - else + else if (is_locked) referenced += page_referenced_file(page); + else if (TestSetPageLocked(page)) + referenced++; + else { + if (page->mapping) + referenced += page_referenced_file(page); + unlock_page(page); + } } return referenced; } @@ -345,36 +432,17 @@ void page_add_anon_rmap(struct page *page, BUG_ON(PageReserved(page)); BUG_ON(!anon_vma); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; index = (address - vma->vm_start) >> PAGE_SHIFT; index += vma->vm_pgoff; index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - /* - * Setting and clearing PG_anon must always happen inside - * page_map_lock to avoid races between mapping and - * unmapping on different processes of the same - * shared cow swapcache page. And while we take the - * page_map_lock PG_anon cannot change from under us. - * Actually PG_anon cannot change under fork either - * since fork holds a reference on the page so it cannot - * be unmapped under fork and in turn copy_page_range is - * allowed to read PG_anon outside the page_map_lock. 
- */ - page_map_lock(page); - if (!page->mapcount) { - BUG_ON(PageAnon(page)); - BUG_ON(page->mapping); - SetPageAnon(page); + if (atomic_inc_and_test(&page->_mapcount)) { page->index = index; page->mapping = (struct address_space *) anon_vma; inc_page_state(nr_mapped); - } else { - BUG_ON(!PageAnon(page)); - BUG_ON(page->index != index); - BUG_ON(page->mapping != (struct address_space *) anon_vma); } - page->mapcount++; - page_map_unlock(page); + /* else checking page index and mapping is racy */ } /** @@ -389,11 +457,8 @@ void page_add_file_rmap(struct page *page) if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) return; - page_map_lock(page); - if (!page->mapcount) + if (atomic_inc_and_test(&page->_mapcount)) inc_page_state(nr_mapped); - page->mapcount++; - page_map_unlock(page); } /** @@ -405,18 +470,22 @@ void page_add_file_rmap(struct page *page) void page_remove_rmap(struct page *page) { BUG_ON(PageReserved(page)); - BUG_ON(!page->mapcount); - page_map_lock(page); - page->mapcount--; - if (!page->mapcount) { + if (atomic_add_negative(-1, &page->_mapcount)) { + BUG_ON(page_mapcount(page) < 0); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ if (page_test_and_clear_dirty(page)) set_page_dirty(page); - if (PageAnon(page)) - clear_page_anon(page); dec_page_state(nr_mapped); } - page_map_unlock(page); } /* @@ -443,8 +512,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) - goto out; + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -488,7 +556,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * ptes from being unmapped, so swapoff can make progress. */ if (PageSwapCache(page) && - page_count(page) != page->mapcount + 2) { + page_count(page) != page_mapcount(page) + 2) { ret = SWAP_FAIL; goto out_unmap; } @@ -515,8 +583,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) // mm->rss--; vx_rsspages_dec(mm); - BUG_ON(!page->mapcount); - page->mapcount--; + page_remove_rmap(page); page_cache_release(page); out_unmap: @@ -549,7 +616,7 @@ out: #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) -static int try_to_unmap_cluster(unsigned long cursor, +static void try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; @@ -566,8 +633,7 @@ static int try_to_unmap_cluster(unsigned long cursor, * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... 
*/ - if (!spin_trylock(&mm->page_table_lock)) - return SWAP_FAIL; + spin_lock(&mm->page_table_lock); address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -625,20 +691,21 @@ static int try_to_unmap_cluster(unsigned long cursor, out_unlock: spin_unlock(&mm->page_table_lock); - return SWAP_AGAIN; } -static inline int try_to_unmap_anon(struct page *page) +static int try_to_unmap_anon(struct page *page) { - struct anon_vma *anon_vma = (struct anon_vma *) page->mapping; + struct anon_vma *anon_vma; struct vm_area_struct *vma; int ret = SWAP_AGAIN; - spin_lock(&anon_vma->lock); - BUG_ON(list_empty(&anon_vma->head)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return ret; + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { ret = try_to_unmap_one(page, vma); - if (ret == SWAP_FAIL || !page->mapcount) + if (ret == SWAP_FAIL || !page_mapped(page)) break; } spin_unlock(&anon_vma->lock); @@ -653,15 +720,12 @@ static inline int try_to_unmap_anon(struct page *page) * contained in the address_space struct it points to. * * This function is only called from try_to_unmap for object-based pages. - * - * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten, - * return a temporary error. */ -static inline int try_to_unmap_file(struct page *page) +static int try_to_unmap_file(struct page *page) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma; struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; @@ -669,13 +733,10 @@ static inline int try_to_unmap_file(struct page *page) unsigned long max_nl_size = 0; unsigned int mapcount; - if (!spin_trylock(&mapping->i_mmap_lock)) - return ret; - - while ((vma = vma_prio_tree_next(vma, &mapping->i_mmap, - &iter, pgoff, pgoff)) != NULL) { + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { ret = try_to_unmap_one(page, vma); - if (ret == SWAP_FAIL || !page->mapcount) + if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -694,8 +755,10 @@ static inline int try_to_unmap_file(struct page *page) max_nl_size = cursor; } - if (max_nl_size == 0) /* any nonlinears locked or reserved */ + if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + ret = SWAP_FAIL; goto out; + } /* * We don't try to search for this page in the nonlinear vmas, @@ -704,8 +767,9 @@ static inline int try_to_unmap_file(struct page *page) * The mapcount of the page we came in with is irrelevant, * but even so use it as a guide to how hard we should try? 
*/ - mapcount = page->mapcount; - page_map_unlock(page); + mapcount = page_mapcount(page); + if (!mapcount) + goto out; cond_resched_lock(&mapping->i_mmap_lock); max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; @@ -721,19 +785,13 @@ static inline int try_to_unmap_file(struct page *page) while (vma->vm_mm->rss && cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - ret = try_to_unmap_cluster( - cursor, &mapcount, vma); - if (ret == SWAP_FAIL) - break; + try_to_unmap_cluster(cursor, &mapcount, vma); cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) - goto relock; + goto out; } - if (ret != SWAP_FAIL) - vma->vm_private_data = - (void *) max_nl_cursor; - ret = SWAP_AGAIN; + vma->vm_private_data = (void *) max_nl_cursor; } cond_resched_lock(&mapping->i_mmap_lock); max_nl_cursor += CLUSTER_SIZE; @@ -749,8 +807,6 @@ static inline int try_to_unmap_file(struct page *page) if (!(vma->vm_flags & VM_RESERVED)) vma->vm_private_data = NULL; } -relock: - page_map_lock(page); out: spin_unlock(&mapping->i_mmap_lock); return ret; @@ -761,11 +817,11 @@ out: * @page: the page to get unmapped * * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its rmap lock. Return values are: + * page, used in the pageout path. Caller must hold the page lock. + * Return values are: * * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ int try_to_unmap(struct page *page) @@ -774,20 +830,13 @@ int try_to_unmap(struct page *page) BUG_ON(PageReserved(page)); BUG_ON(!PageLocked(page)); - BUG_ON(!page->mapcount); if (PageAnon(page)) ret = try_to_unmap_anon(page); else ret = try_to_unmap_file(page); - if (!page->mapcount) { - if (page_test_and_clear_dirty(page)) - set_page_dirty(page); - if (PageAnon(page)) - clear_page_anon(page); - dec_page_state(nr_mapped); + if (!page_mapped(page)) ret = SWAP_SUCCESS; - } return ret; }
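
A note on the accounting scheme above, not part of the patch: the diff replaces the
page_map_lock-protected page->mapcount with an atomic page->_mapcount that starts at
-1 for an unmapped page, so atomic_inc_and_test() returns true exactly when the first
mapping is added and atomic_add_negative(-1, ...) exactly when the last one is removed,
letting nr_mapped be maintained without a per-page lock. A minimal userspace sketch of
that convention, with C11 atomics standing in for the kernel's atomic_t (illustration
only; add_rmap/remove_rmap are hypothetical stand-ins, not kernel functions):

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/* Models page->_mapcount: -1 means "not mapped by any pte". */
	static atomic_int mapcount = -1;

	/* Mirrors atomic_inc_and_test(): true when the increment reaches
	 * zero, i.e. this caller added the first mapping. */
	static int add_rmap(void)
	{
		return atomic_fetch_add(&mapcount, 1) + 1 == 0;
	}

	/* Mirrors atomic_add_negative(-1, ...): true when the decrement
	 * drops below zero, i.e. this caller removed the last mapping. */
	static int remove_rmap(void)
	{
		return atomic_fetch_sub(&mapcount, 1) - 1 < 0;
	}

	int main(void)
	{
		assert(add_rmap());	/* first mapper: inc_page_state(nr_mapped) */
		assert(!add_rmap());	/* second mapper: no global state change */
		assert(!remove_rmap());
		assert(remove_rmap());	/* last unmapper: dec_page_state(nr_mapped) */
		printf("final mapcount: %d\n", atomic_load(&mapcount));
		return 0;
	}

This is why page_remove_rmap() above can test atomic_add_negative(-1, &page->_mapcount)
instead of taking page_map_lock, and why page_mapcount()/page_mapped() may read the
count racily yet still give each state transition to exactly one caller.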