/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead pte-based reverse mapping scheme.
 * This is kept modular because we may want to experiment
 * with object-based reverse mapping schemes. Please try
 * to keep this thing as modular as possible.
 */

/*
 * Locking:
 * - the page->pte.chain is protected by the PG_maplock bit,
 *   which nests within the mm->page_table_lock,
 *   which nests within the page lock.
 * - because swapout locking is opposite to the locking order
 *   in the page fault path, the swapout path uses trylocks
 *   on the mm->page_table_lock
 */

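/*
 * Illustrative sketch (not part of the original comment): the pageout
 * path acquires page lock -> rmap lock -> mm->page_table_lock, while the
 * page fault path already holds mm->page_table_lock when it reaches the
 * rmap code. Trylocking the bottom-level lock avoids the ABBA deadlock
 * at the cost of an occasional retry, which is exactly what
 * try_to_unmap_one() below does:
 *
 *	if (!spin_trylock(&mm->page_table_lock))
 *		return SWAP_AGAIN;	(caller retries the page later)
 */
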
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/cache.h>
#include <linux/percpu.h>

#include <asm/pgalloc.h>
#include <asm/rmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

/*
 * Something oopsable to put for now in the page->mapping
 * of an anonymous page, to test that it is ignored.
 */
#define ANON_MAPPING_DEBUG	((struct address_space *) 0)

static inline void clear_page_anon(struct page *page)
{
	BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
	page->mapping = NULL;
	ClearPageAnon(page);
}

/*
 * Shared pages have a chain of pte_chain structures, used to locate
 * all the mappings to this page. We only need a pointer to the pte
 * here, the page struct for the page table page contains the process
 * it belongs to and the offset within that process.
 *
 * We use an array of pte pointers in this structure to minimise cache misses
 * while traversing reverse maps.
 */
#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t))

/*
 * next_and_idx encodes both the address of the next pte_chain and the
 * offset of the lowest-index used pte in ptes[] (which is equal also
 * to the offset of the highest-index unused pte in ptes[], plus one).
 */
struct pte_chain {
	unsigned long next_and_idx;
	pte_addr_t ptes[NRPTE];
} ____cacheline_aligned;

kmem_cache_t *pte_chain_cache;

static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain)
{
	return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE);
}

static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr)
{
	return (struct pte_chain *)(pte_chain_addr & ~NRPTE);
}

static inline int pte_chain_idx(struct pte_chain *pte_chain)
{
	return pte_chain->next_and_idx & NRPTE;
}

static inline unsigned long
pte_chain_encode(struct pte_chain *pte_chain, int idx)
{
	return (unsigned long)pte_chain | idx;
}

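/*
 * Worked example (illustrative; assumes L1_CACHE_BYTES == 64 and an
 * 8-byte pte_addr_t, giving NRPTE == (64 - 8) / 8 == 7): pte_chains are
 * ____cacheline_aligned, so the low bits of their addresses are zero and
 * NRPTE (0x7) doubles as the index mask. For a pte_chain at the
 * hypothetical address 0xffff880012345640 with three leading slots free:
 *
 *	pte_chain_encode(pc, 3)	-> 0xffff880012345643
 *	pte_chain_ptr(0x...643)	-> 0x...643 & ~0x7 == 0x...640
 *	pte_chain_idx(pc)	-> next_and_idx & 0x7 == 3
 *
 * Note this packing relies on NRPTE being of the form 2^n - 1, which
 * holds for the common cacheline sizes; the numbers above are for
 * illustration only.
 */
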
/*
 * pte_chain list management policy:
 *
 * - If a page has a pte_chain list then it is shared by at least two processes,
 *   because a single sharing uses PageDirect. (Well, this isn't true yet,
 *   because this code doesn't collapse singletons back to PageDirect on the
 *   remove path).
 * - A pte_chain list has free space only in the head member - all succeeding
 *   members are 100% full.
 * - If the head element has free space, it occurs in its leading slots.
 * - All free space in the pte_chain is at the start of the head member.
 * - Insertion into the pte_chain puts a pte pointer in the last free slot of
 *   the head member.
 * - Removal from a pte chain moves the head pte of the head member onto the
 *   victim pte and frees the head member if it became empty.
 */

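/*
 * Illustrative example (assuming NRPTE == 7): a page mapped nine times
 * could have a chain like this, where 'p' marks a used slot and '-' a
 * free one; all free space sits in the leading slots of the head:
 *
 *	head: [- - - - - p p] -> [p p p p p p p]
 *
 * Inserting a tenth pte fills the last free slot of the head (index 4),
 * and removing any pte moves the head's lowest-index pte (index 5 here)
 * into the vacated slot, bumping next_and_idx by one.
 */
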
/**
 ** VM stuff below this comment
 **/

/**
 * page_referenced - test if the page was referenced
 * @page: the page to test
 *
 * Quick test_and_clear_referenced for all mappings to a page,
 * returns the number of processes which referenced the page.
 * Caller needs to hold the rmap lock.
 *
 * If the page has a single-entry pte_chain, collapse that back to a PageDirect
 * representation. This way, it's only done under memory pressure.
 */
int fastcall page_referenced(struct page * page)
{
	struct pte_chain *pc;
	int referenced = 0;

	if (page_test_and_clear_young(page))
		referenced++;

	if (TestClearPageReferenced(page))
		referenced++;

	if (PageDirect(page)) {
		pte_t *pte = rmap_ptep_map(page->pte.direct);
		if (ptep_test_and_clear_young(pte))
			referenced++;
		rmap_ptep_unmap(pte);
	} else {
		int nr_chains = 0;

		/* Check all the page tables mapping this page. */
		for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) {
			int i;

			for (i = pte_chain_idx(pc); i < NRPTE; i++) {
				pte_addr_t pte_paddr = pc->ptes[i];
				pte_t *p;

				p = rmap_ptep_map(pte_paddr);
				if (ptep_test_and_clear_young(p))
					referenced++;
				rmap_ptep_unmap(p);
				nr_chains++;
			}
		}
		if (nr_chains == 1) {
			pc = page->pte.chain;
			page->pte.direct = pc->ptes[NRPTE-1];
			SetPageDirect(page);
			pc->ptes[NRPTE-1] = 0;
			__pte_chain_free(pc);
		}
	}
	return referenced;
}

/**
 * page_add_rmap - add reverse mapping entry to a page
 * @page: the page to add the mapping to
 * @ptep: the page table entry mapping this page
 *
 * Add a new pte reverse mapping to a page.
 * The caller needs to hold the mm->page_table_lock.
 */
struct pte_chain * fastcall
page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
{
	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
	struct pte_chain *cur_pte_chain;

	if (PageReserved(page))
		return pte_chain;

	rmap_lock(page);

	if (page->pte.direct == 0) {
		page->pte.direct = pte_paddr;
		SetPageDirect(page);
		if (!page->mapping) {
			SetPageAnon(page);
			page->mapping = ANON_MAPPING_DEBUG;
		}
		inc_page_state(nr_mapped);
		goto out;
	}

	if (PageDirect(page)) {
		/* Convert a direct pointer into a pte_chain */
		ClearPageDirect(page);
		pte_chain->ptes[NRPTE-1] = page->pte.direct;
		pte_chain->ptes[NRPTE-2] = pte_paddr;
		pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2);
		page->pte.direct = 0;
		page->pte.chain = pte_chain;
		pte_chain = NULL;	/* We consumed it */
		goto out;
	}

	cur_pte_chain = page->pte.chain;
	if (cur_pte_chain->ptes[0]) {	/* It's full */
		pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain,
								NRPTE - 1);
		page->pte.chain = pte_chain;
		pte_chain->ptes[NRPTE-1] = pte_paddr;
		pte_chain = NULL;	/* We consumed it */
		goto out;
	}
	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
	cur_pte_chain->next_and_idx--;
out:
	rmap_unlock(page);
	return pte_chain;
}

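/*
 * Sketch of the expected caller pattern (modelled on the fault handlers
 * in mm/memory.c; pte_chain_free() is assumed to be the NULL-tolerant
 * wrapper around __pte_chain_free() from the rmap header):
 *
 *	pte_chain = pte_chain_alloc(GFP_KERNEL);
 *	spin_lock(&mm->page_table_lock);
 *	...
 *	pte_chain = page_add_rmap(page, ptep, pte_chain);
 *	spin_unlock(&mm->page_table_lock);
 *	pte_chain_free(pte_chain);
 *
 * Whatever page_add_rmap() hands back was not consumed, and may be
 * reused for the next mapping or freed.
 */
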
/**
 * page_remove_rmap - take down reverse mapping to a page
 * @page: page to remove mapping from
 * @ptep: page table entry to remove
 *
 * Removes the reverse mapping from the pte_chain of the page,
 * after that the caller can clear the page table entry and free
 * the page.
 * Caller needs to hold the mm->page_table_lock.
 */
void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
{
	pte_addr_t pte_paddr = ptep_to_paddr(ptep);
	struct pte_chain *pc;

	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
		return;

	rmap_lock(page);

	if (!page_mapped(page))
		goto out_unlock;	/* remap_page_range() from a driver? */

	if (PageDirect(page)) {
		if (page->pte.direct == pte_paddr) {
			page->pte.direct = 0;
			ClearPageDirect(page);
			goto out;
		}
	} else {
		struct pte_chain *start = page->pte.chain;
		struct pte_chain *next;
		int victim_i = pte_chain_idx(start);

		for (pc = start; pc; pc = next) {
			int i;

			next = pte_chain_next(pc);
			if (next)
				prefetch(next);
			for (i = pte_chain_idx(pc); i < NRPTE; i++) {
				pte_addr_t pa = pc->ptes[i];

				if (pa != pte_paddr)
					continue;
				pc->ptes[i] = start->ptes[victim_i];
				start->ptes[victim_i] = 0;
				if (victim_i == NRPTE-1) {
					/* Emptied a pte_chain */
					page->pte.chain = pte_chain_next(start);
					__pte_chain_free(start);
				} else {
					start->next_and_idx++;
				}
				goto out;
			}
		}
	}
out:
	if (!page_mapped(page)) {
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		if (PageAnon(page))
			clear_page_anon(page);
		dec_page_state(nr_mapped);
	}
out_unlock:
	rmap_unlock(page);
}

/**
 * try_to_unmap_one - worker function for try_to_unmap
 * @page: page to unmap
 * @ptep: page table entry to unmap from page
 *
 * Internal helper function for try_to_unmap, called for each page
 * table entry mapping a page. Because locking order here is opposite
 * to the locking order used by the page fault path, we use trylocks.
 * Locking:
 *	page lock			shrink_list(), trylock
 *	rmap lock			shrink_list()
 *	mm->page_table_lock		try_to_unmap_one(), trylock
 */
static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
{
	pte_t *ptep = rmap_ptep_map(paddr);
	unsigned long address = ptep_to_address(ptep);
	struct mm_struct * mm = ptep_to_mm(ptep);
	struct vm_area_struct * vma;
	pte_t pte;
	int ret;

	if (!mm)
		BUG();

	/*
	 * We need the page_table_lock to protect us from page faults,
	 * munmap, fork, etc...
	 */
	if (!spin_trylock(&mm->page_table_lock)) {
		rmap_ptep_unmap(ptep);
		return SWAP_AGAIN;
	}

	/* unmap_vmas drops page_table_lock with vma unlinked */
	vma = find_vma(mm, address);
	if (!vma) {
		ret = SWAP_FAIL;
		goto out_unlock;
	}

	/* The page is mlock()d, we cannot swap it out. */
	if (vma->vm_flags & VM_LOCKED) {
		ret = SWAP_FAIL;
		goto out_unlock;
	}

	/* Nuke the page table entry. */
	flush_cache_page(vma, address);
	pte = ptep_clear_flush(vma, address, ptep);

	if (PageAnon(page)) {
		swp_entry_t entry = { .val = page->private };
		/*
		 * Store the swap location in the pte.
		 * See handle_pte_fault() ...
		 */
		BUG_ON(!PageSwapCache(page));
		swap_duplicate(entry);
		set_pte(ptep, swp_entry_to_pte(entry));
		BUG_ON(pte_file(*ptep));
	} else {
		/*
		 * If a nonlinear mapping then store the file page offset
		 * in the pte.
		 */
		BUG_ON(!page->mapping);
		if (page->index != linear_page_index(vma, address)) {
			set_pte(ptep, pgoff_to_pte(page->index));
			BUG_ON(!pte_file(*ptep));
		}
	}

	/* Move the dirty bit to the physical page now the pte is gone. */
	if (pte_dirty(pte))
		set_page_dirty(page);

	mm->rss--;
	page_cache_release(page);
	ret = SWAP_SUCCESS;

out_unlock:
	rmap_ptep_unmap(ptep);
	spin_unlock(&mm->page_table_lock);
	return ret;
}

/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path. Caller must hold the page lock
 * and its rmap lock. Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a trylock, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int fastcall try_to_unmap(struct page * page)
{
	struct pte_chain *pc, *next_pc, *start;
	int ret = SWAP_SUCCESS;
	int victim_i;

	/* This page should not be on the pageout lists. */
	if (PageReserved(page))
		BUG();
	if (!PageLocked(page))
		BUG();

	if (PageDirect(page)) {
		ret = try_to_unmap_one(page, page->pte.direct);
		if (ret == SWAP_SUCCESS) {
			page->pte.direct = 0;
			ClearPageDirect(page);
		}
		goto out;
	}

	start = page->pte.chain;
	victim_i = pte_chain_idx(start);
	for (pc = start; pc; pc = next_pc) {
		int i;

		next_pc = pte_chain_next(pc);
		if (next_pc)
			prefetch(next_pc);
		for (i = pte_chain_idx(pc); i < NRPTE; i++) {
			pte_addr_t pte_paddr = pc->ptes[i];

			switch (try_to_unmap_one(page, pte_paddr)) {
			case SWAP_SUCCESS:
				/*
				 * Release a slot. If we're releasing the
				 * first pte in the first pte_chain then
				 * pc->ptes[i] and start->ptes[victim_i] both
				 * refer to the same thing. It works out.
				 */
				pc->ptes[i] = start->ptes[victim_i];
				start->ptes[victim_i] = 0;
				victim_i++;
				if (victim_i == NRPTE) {
					page->pte.chain = pte_chain_next(start);
					__pte_chain_free(start);
					start = page->pte.chain;
					victim_i = 0;
				} else {
					start->next_and_idx++;
				}
				break;
			case SWAP_AGAIN:
				/* Skip this pte, remembering status. */
				ret = SWAP_AGAIN;
				continue;
			case SWAP_FAIL:
				ret = SWAP_FAIL;
				goto out;
			}
		}
	}
out:
	if (!page_mapped(page)) {
		if (page_test_and_clear_dirty(page))
			set_page_dirty(page);
		if (PageAnon(page))
			clear_page_anon(page);
		dec_page_state(nr_mapped);
		ret = SWAP_SUCCESS;
	}
	return ret;
}

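/*
 * Sketch of how the pageout path consumes the return value (modelled on
 * shrink_list() in mm/vmscan.c; label names are illustrative):
 *
 *	rmap_lock(page);
 *	switch (try_to_unmap(page)) {
 *	case SWAP_FAIL:
 *		rmap_unlock(page);
 *		goto activate_locked;	(keep the page in memory)
 *	case SWAP_AGAIN:
 *		rmap_unlock(page);
 *		goto keep_locked;	(a trylock failed, retry later)
 *	case SWAP_SUCCESS:
 *		;			(fall through to write out or free)
 *	}
 *	rmap_unlock(page);
 */
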
/**
 ** No more VM stuff below this comment, only pte_chain helper
 ** functions.
 **/

static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags)
{
	struct pte_chain *pc = p;

	memset(pc, 0, sizeof(*pc));
}

static DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0;

/**
 * __pte_chain_free - free pte_chain structure
 * @pte_chain: pte_chain struct to free
 */
void __pte_chain_free(struct pte_chain *pte_chain)
{
	struct pte_chain **pte_chainp;

	pte_chainp = &get_cpu_var(local_pte_chain);
	if (pte_chain->next_and_idx)
		pte_chain->next_and_idx = 0;
	if (*pte_chainp)
		kmem_cache_free(pte_chain_cache, *pte_chainp);
	*pte_chainp = pte_chain;
	put_cpu_var(local_pte_chain);
}

/*
 * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap().
 *
 * The caller of page_add_rmap() must perform the allocation because
 * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap()
 * will not actually use the pte_chain, because there is space available in one
 * of the existing pte_chains which are attached to the page. So the case of
 * allocating and then freeing a single pte_chain is specially optimised here,
 * with a one-deep per-cpu cache.
 */
struct pte_chain *pte_chain_alloc(int gfp_flags)
{
	struct pte_chain *ret;
	struct pte_chain **pte_chainp;

	might_sleep_if(gfp_flags & __GFP_WAIT);

	pte_chainp = &get_cpu_var(local_pte_chain);
	ret = *pte_chainp;
	if (ret) {
		*pte_chainp = NULL;
		put_cpu_var(local_pte_chain);
	} else {
		put_cpu_var(local_pte_chain);
		ret = kmem_cache_alloc(pte_chain_cache, gfp_flags);
	}
	return ret;
}

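/*
 * The one-deep cache means the common allocate-then-unused-free cycle on
 * a CPU never touches the slab allocator (illustrative; see the caller
 * sketch after page_add_rmap() above):
 *
 *	pc = pte_chain_alloc(GFP_ATOMIC);	(takes the cached chain)
 *	pc = page_add_rmap(page, ptep, pc);	(often returns it unused)
 *	pte_chain_free(pc);			(puts it straight back)
 */
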
void __init pte_chain_init(void)
{
	pte_chain_cache = kmem_cache_create(	"pte_chain",
						sizeof(struct pte_chain),
						sizeof(struct pte_chain),
						0,
						pte_chain_ctor,
						NULL);

	if (!pte_chain_cache)
		panic("failed to create pte_chain cache!\n");
}