/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_list.  Only those fields initialized below are used.
 */
static struct address_space_operations swap_aops = {
        .writepage      = swap_writepage,
};

static struct backing_dev_info swap_backing_dev_info = {
        .state          = 0,    /* uncongested */
};

struct address_space swapper_space = {
        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
        .tree_lock      = SPIN_LOCK_UNLOCKED,
        .nrpages        = 0,    /* total_swapcache_pages */
        .a_ops          = &swap_aops,
        .backing_dev_info = &swap_backing_dev_info,
};

#define INC_CACHE_INFO(x)       do { swap_cache_info.x++; } while (0)

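/*
 * Book-keeping for the swap cache: the counters below are bumped via
 * INC_CACHE_INFO() on the paths in this file and dumped as a single
 * printk line by show_swap_cache_info().
 */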
static struct {
        unsigned long add_total;
        unsigned long del_total;
        unsigned long find_success;
        unsigned long find_total;
        unsigned long noent_race;
        unsigned long exist_race;
} swap_cache_info;

void show_swap_cache_info(void)
{
        printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total,
                swap_cache_info.noent_race, swap_cache_info.exist_race);
}

/*
 * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
static int __add_to_swap_cache(struct page *page,
                swp_entry_t entry, int gfp_mask)
{
        int error;

        BUG_ON(PageSwapCache(page));
        BUG_ON(PagePrivate(page));
        error = radix_tree_preload(gfp_mask);
        if (!error) {
                page_cache_get(page);
                spin_lock(&swapper_space.tree_lock);
                error = radix_tree_insert(&swapper_space.page_tree,
                                                entry.val, page);
                if (!error) {
                        SetPageLocked(page);
                        SetPageSwapCache(page);
                        page->private = entry.val;
                        total_swapcache_pages++;
                        pagecache_acct(1);
                } else
                        page_cache_release(page);
                spin_unlock(&swapper_space.tree_lock);
                radix_tree_preload_end();
        }
        return error;
}

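/*
 * add_to_swap_cache is the statistics-keeping wrapper around
 * __add_to_swap_cache: it takes its own reference on the swap entry via
 * swap_duplicate() (dropping it again with swap_free() on failure) and
 * then adds the page with GFP_KERNEL.
 */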
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
        int error;

        if (!swap_duplicate(entry)) {
                INC_CACHE_INFO(noent_race);
                return -ENOENT;
        }
        error = __add_to_swap_cache(page, entry, GFP_KERNEL);
        /*
         * Anon pages are already on the LRU, we don't run lru_cache_add here.
         */
        if (error) {
                swap_free(entry);
                if (error == -EEXIST)
                        INC_CACHE_INFO(exist_race);
                return error;
        }
        INC_CACHE_INFO(add_total);
        return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
        BUG_ON(!PageLocked(page));
        BUG_ON(!PageSwapCache(page));
        BUG_ON(PageWriteback(page));

        radix_tree_delete(&swapper_space.page_tree, page->private);
        page->private = 0;
        ClearPageSwapCache(page);
        total_swapcache_pages--;
        pagecache_acct(-1);
        INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page * page)
{
        swp_entry_t entry;
        int pf_flags;
        int err;

        if (!PageLocked(page))
                BUG();

        for (;;) {
                entry = get_swap_page();
                if (!entry.val)
                        return 0;

                /* Radix-tree node allocations are performing
                 * GFP_ATOMIC allocations under PF_MEMALLOC.
                 * They can completely exhaust the page allocator.
                 *
                 * So PF_MEMALLOC is dropped here.  This causes the slab
                 * allocations to fail earlier, so radix-tree nodes will
                 * then be allocated from the mempool reserves.
                 *
                 * We're still using __GFP_HIGH for radix-tree node
                 * allocations, so some of the emergency pools are available,
                 * just not all of them.
                 */

                pf_flags = current->flags;
                current->flags &= ~PF_MEMALLOC;

                /*
                 * Add it to the swap cache and mark it dirty
                 */
                err = __add_to_swap_cache(page, entry, GFP_ATOMIC);

                if (pf_flags & PF_MEMALLOC)
                        current->flags |= PF_MEMALLOC;

                switch (err) {
                case 0:                         /* Success */
                        SetPageUptodate(page);
                        SetPageDirty(page);
                        INC_CACHE_INFO(add_total);
                        return 1;
                case -EEXIST:
                        /* Raced with "speculative" read_swap_cache_async */
                        INC_CACHE_INFO(exist_race);
                        swap_free(entry);
                        continue;
                default:
                        /* -ENOMEM radix-tree allocation failure */
                        swap_free(entry);
                        return 0;
                }
        }
}
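
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * how a vmscan-style caller is expected to use add_to_swap(), assuming it
 * already holds the page lock on an anonymous page.  The helper name
 * example_try_to_swap() and its "keep the page" policy are hypothetical.
 */
#if 0
static int example_try_to_swap(struct page *page)
{
        BUG_ON(!PageLocked(page));

        if (PageAnon(page) && !PageSwapCache(page)) {
                if (!add_to_swap(page))
                        return 0;       /* no swap space or no memory: keep page */
        }
        /* Page is now dirty and in the swap cache; writeback may proceed. */
        return 1;
}
#endif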

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;

        BUG_ON(!PageSwapCache(page));
        BUG_ON(!PageLocked(page));
        BUG_ON(PageWriteback(page));
        BUG_ON(PagePrivate(page));

        entry.val = page->private;

        spin_lock(&swapper_space.tree_lock);
        __delete_from_swap_cache(page);
        spin_unlock(&swapper_space.tree_lock);

        swap_free(entry);
        page_cache_release(page);
}

/*
 * Strange swizzling function only for use by shmem_writepage
 */
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
        int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
        if (!err) {
                remove_from_page_cache(page);
                page_cache_release(page);       /* pagecache ref */
                if (!swap_duplicate(entry))
                        BUG();
                SetPageDirty(page);
                INC_CACHE_INFO(add_total);
        } else if (err == -EEXIST)
                INC_CACHE_INFO(exist_race);
        return err;
}

/*
 * Strange swizzling function for shmem_getpage (and shmem_unuse)
 */
int move_from_swap_cache(struct page *page, unsigned long index,
                struct address_space *mapping)
{
        int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
        if (!err) {
                delete_from_swap_cache(page);
                /* shift page from clean_pages to dirty_pages list */
                ClearPageDirty(page);
                set_page_dirty(page);
        }
        return err;
}
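
/*
 * Note (added commentary): the two "swizzling" helpers above are a pair.
 * move_to_swap_cache() shifts a page out of its shmem mapping and into the
 * swap cache for shmem_writepage(); move_from_swap_cache() shifts it back
 * into a file mapping and marks it dirty for shmem_getpage() and
 * shmem_unuse().
 */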

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's OK to check for PageSwapCache without the page lock
 * here because we are going to recheck inside
 * remove_exclusive_swap_page() _with_ the lock.
 *                                      - Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
        if (PageSwapCache(page) && !TestSetPageLocked(page)) {
                remove_exclusive_swap_page(page);
                unlock_page(page);
        }
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.  We cannot do a lock_page,
 * as we are holding the page_table_lock spinlock.
 */
void free_page_and_swap_cache(struct page *page)
{
        free_swap_cache(page);
        page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
        int chunk = 16;
        struct page **pagep = pages;

        lru_add_drain();
        while (nr) {
                int todo = min(chunk, nr);
                int i;

                for (i = 0; i < todo; i++)
                        free_swap_cache(pagep[i]);
                release_pages(pagep, todo, 0);
                pagep += todo;
                nr -= todo;
        }
}
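
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * the batched-teardown pattern free_pages_and_swap_cache() is written for.
 * A caller (the unmap path, for example) gathers struct page pointers
 * while it holds page-table locks and releases the whole batch in one
 * call.  The example_batch type and example_flush() helper are
 * hypothetical.
 */
#if 0
struct example_batch {
        struct page *pages[64];
        int nr;
};

static void example_flush(struct example_batch *b)
{
        if (b->nr) {
                free_pages_and_swap_cache(b->pages, b->nr);
                b->nr = 0;
        }
}
#endif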

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
        struct page *page;

        spin_lock(&swapper_space.tree_lock);
        page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
        if (page) {
                page_cache_get(page);
                INC_CACHE_INFO(find_success);
        }
        spin_unlock(&swapper_space.tree_lock);
        INC_CACHE_INFO(find_total);
        return page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page * read_swap_cache_async(swp_entry_t entry)
{
        struct page *found_page, *new_page = NULL;
        int err;

        do {
                /*
                 * First check the swap cache.  Since this is normally
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
                spin_lock(&swapper_space.tree_lock);
                found_page = radix_tree_lookup(&swapper_space.page_tree,
                                                entry.val);
                if (found_page)
                        page_cache_get(found_page);
                spin_unlock(&swapper_space.tree_lock);
                if (found_page)
                        break;

                /*
                 * Get a new page to read into from swap.
                 */
                if (!new_page) {
                        new_page = alloc_page(GFP_HIGHUSER);
                        if (!new_page)
                                break;          /* Out of memory */
                }

                /*
                 * Associate the page with swap entry in the swap cache.
                 * May fail (-ENOENT) if swap entry has been freed since
                 * our caller observed it.  May fail (-EEXIST) if there
                 * is already a page associated with this entry in the
                 * swap cache: added by a racing read_swap_cache_async,
                 * or by try_to_swap_out (or shmem_writepage) re-using
                 * the just freed swap entry for an existing page.
                 * May fail (-ENOMEM) if radix-tree node allocation failed.
                 */
                err = add_to_swap_cache(new_page, entry);
                if (!err) {
                        /*
                         * Initiate read into locked page and return.
                         */
                        lru_cache_add_active(new_page);
                        swap_readpage(NULL, new_page);
                        return new_page;
                }
        } while (err != -ENOENT && err != -ENOMEM);

        if (new_page)
                page_cache_release(new_page);
        return found_page;
}
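
/*
 * Illustrative only, not part of the original file: a minimal sketch of
 * how a fault handler is expected to combine lookup_swap_cache() and
 * read_swap_cache_async(), assuming the caller has already extracted the
 * swp_entry_t from the faulting pte.  The helper name example_swapin()
 * is hypothetical and the pte/vma handling is deliberately elided.
 */
#if 0
static struct page *example_swapin(swp_entry_t entry)
{
        struct page *page;

        page = lookup_swap_cache(entry);        /* fast path: already cached */
        if (!page)
                page = read_swap_cache_async(entry);    /* allocate, add, start I/O */
        return page;    /* NULL on OOM or if the entry was freed meanwhile */
}
#endif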