vserver 1.9.5.x5
[linux-2.6.git] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file is to realize architecture-independent
5  * machine suspend feature using pretty near only high-level routines
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/syscalls.h>
63 #include <linux/console.h>
64 #include <linux/highmem.h>
65 #include <linux/bio.h>
66
67 #include <asm/uaccess.h>
68 #include <asm/mmu_context.h>
69 #include <asm/pgtable.h>
70 #include <asm/tlbflush.h>
71 #include <asm/io.h>
72
73 #include "power.h"
74
75 /* References to section boundaries */
76 extern const void __nosave_begin, __nosave_end;
77
78 /* Variables to be preserved over suspend */
79 static int pagedir_order_check;
80 static int nr_copy_pages_check;
81
82 extern char resume_file[];
83 static dev_t resume_device;
84 /* Local variables that should not be affected by save */
85 unsigned int nr_copy_pages __nosavedata = 0;
86
87 /* Suspend pagedir is allocated before final copy, therefore it
88    must be freed after resume 
89
90    Warning: this is evil. There are actually two pagedirs at time of
91    resume. One is "pagedir_save", which is empty frame allocated at
92    time of suspend, that must be freed. Second is "pagedir_nosave", 
93    allocated at time of resume, that travels through memory not to
94    collide with anything.
95
96    Warning: this is even more evil than it seems. Pagedirs this file
97    talks about are completely different from page directories used by
98    MMU hardware.
99  */
100 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
101 static suspend_pagedir_t *pagedir_save;
102 static int pagedir_order __nosavedata = 0;
103
104 #define SWSUSP_SIG      "S1SUSPEND"
105
/*
 * In-memory image of the page at swap offset 0 of the resume device; it is
 * read and written by mark_swapfiles().  The reserved padding is sized so
 * that the padding, the swap entry and the two 10-byte signature fields
 * add up to exactly PAGE_SIZE, which puts the signatures in the last 20
 * bytes of the page.
 */
static struct swsusp_header {
        char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
        swp_entry_t swsusp_info;        /* swap entry stored by mark_swapfiles() */
        char    orig_sig[10];           /* signature found before we replaced it */
        char    sig[10];                /* signature currently in the page */
} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
112
113 static struct swsusp_info swsusp_info;
114
115 /*
116  * XXX: We try to keep some more pages free so that I/O operations succeed
117  * without paging. Might this be more?
118  */
119 #define PAGES_FOR_IO    512
120
121 /*
122  * Saving part...
123  */
124
/* We memorize in swapfile_used what swap devices are used for suspension */
#define SWAPFILE_UNUSED    0    /* Slot whose swap_info flags are zero */
#define SWAPFILE_SUSPEND   1    /* This is the suspending device */
#define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */

/* Per-slot SWAPFILE_* classification, filled in by swsusp_swap_check() */
static unsigned short swapfile_used[MAX_SWAPFILES];
/* Index of the device chosen for the image; 0xFFFF while none is chosen */
static unsigned short root_swap;
132
133 static int mark_swapfiles(swp_entry_t prev)
134 {
135         int error;
136
137         rw_swap_page_sync(READ, 
138                           swp_entry(root_swap, 0),
139                           virt_to_page((unsigned long)&swsusp_header));
140         if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
141             !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
142                 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
143                 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
144                 swsusp_header.swsusp_info = prev;
145                 error = rw_swap_page_sync(WRITE, 
146                                           swp_entry(root_swap, 0),
147                                           virt_to_page((unsigned long)
148                                                        &swsusp_header));
149         } else {
150                 pr_debug("swsusp: Partition is not swap space.\n");
151                 error = -ENODEV;
152         }
153         return error;
154 }
155
156 /*
157  * Check whether the swap device is the specified resume
158  * device, irrespective of whether they are specified by
159  * identical names.
160  *
161  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
162  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
163  * and they'll be considered the same device.  This is *necessary* for
164  * devfs, since the resume code can only recognize the form /dev/hda4,
165  * but the suspend code would see the long name.)
166  */
167 static int is_resume_device(const struct swap_info_struct *swap_info)
168 {
169         struct file *file = swap_info->swap_file;
170         struct inode *inode = file->f_dentry->d_inode;
171
172         return S_ISBLK(inode->i_mode) &&
173                 resume_device == MKDEV(imajor(inode), iminor(inode));
174 }
175
176 static int swsusp_swap_check(void) /* This is called before saving image */
177 {
178         int i, len;
179         
180         len=strlen(resume_file);
181         root_swap = 0xFFFF;
182         
183         swap_list_lock();
184         for(i=0; i<MAX_SWAPFILES; i++) {
185                 if (swap_info[i].flags == 0) {
186                         swapfile_used[i]=SWAPFILE_UNUSED;
187                 } else {
188                         if(!len) {
189                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
190                                 if(root_swap == 0xFFFF) {
191                                         swapfile_used[i] = SWAPFILE_SUSPEND;
192                                         root_swap = i;
193                                 } else
194                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
195                         } else {
196                                 /* we ignore all swap devices that are not the resume_file */
197                                 if (is_resume_device(&swap_info[i])) {
198                                         swapfile_used[i] = SWAPFILE_SUSPEND;
199                                         root_swap = i;
200                                 } else {
201                                         swapfile_used[i] = SWAPFILE_IGNORED;
202                                 }
203                         }
204                 }
205         }
206         swap_list_unlock();
207         return (root_swap != 0xffff) ? 0 : -ENODEV;
208 }
209
210 /**
211  * This is called after saving image so modification
212  * will be lost after resume... and that's what we want.
213  * we make the device unusable. A new call to
214  * lock_swapdevices can unlock the devices. 
215  */
216 static void lock_swapdevices(void)
217 {
218         int i;
219
220         swap_list_lock();
221         for(i = 0; i< MAX_SWAPFILES; i++)
222                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
223                         swap_info[i].flags ^= 0xFF;
224                 }
225         swap_list_unlock();
226 }
227
228
229
230 /**
231  *      write_swap_page - Write one page to a fresh swap location.
232  *      @addr:  Address we're writing.
233  *      @loc:   Place to store the entry we used.
234  *
235  *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
236  *      errors. That is an artifact left over from swsusp. It did not 
237  *      check the return of rw_swap_page_sync() at all, since most pages
238  *      written back to swap would return -EIO.
239  *      This is a partial improvement, since we will at least return other
240  *      errors, though we need to eventually fix the damn code.
241  */
242
243 static int write_page(unsigned long addr, swp_entry_t * loc)
244 {
245         swp_entry_t entry;
246         int error = 0;
247
248         entry = get_swap_page();
249         if (swp_offset(entry) && 
250             swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
251                 error = rw_swap_page_sync(WRITE, entry,
252                                           virt_to_page(addr));
253                 if (error == -EIO)
254                         error = 0;
255                 if (!error)
256                         *loc = entry;
257         } else
258                 error = -ENOSPC;
259         return error;
260 }
261
262
263 /**
264  *      data_free - Free the swap entries used by the saved image.
265  *
266  *      Walk the list of used swap entries and free each one. 
267  *      This is only used for cleanup when suspend fails.
268  */
269
270 static void data_free(void)
271 {
272         swp_entry_t entry;
273         int i;
274
275         for (i = 0; i < nr_copy_pages; i++) {
276                 entry = (pagedir_nosave + i)->swap_address;
277                 if (entry.val)
278                         swap_free(entry);
279                 else
280                         break;
281                 (pagedir_nosave + i)->swap_address = (swp_entry_t){0};
282         }
283 }
284
285
286 /**
287  *      data_write - Write saved image to swap.
288  *
289  *      Walk the list of pages in the image and sync each one to swap.
290  */
291
292 static int data_write(void)
293 {
294         int error = 0;
295         int i;
296         unsigned int mod = nr_copy_pages / 100;
297
298         if (!mod)
299                 mod = 1;
300
301         printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
302         for (i = 0; i < nr_copy_pages && !error; i++) {
303                 if (!(i%mod))
304                         printk( "\b\b\b\b%3d%%", i / mod );
305                 error = write_page((pagedir_nosave+i)->address,
306                                           &((pagedir_nosave+i)->swap_address));
307         }
308         printk("\b\b\b\bdone\n");
309         return error;
310 }
311
/*
 * dump_info - Emit the contents of swsusp_info through pr_debug():
 * kernel version code, number of physical pages, the utsname strings,
 * CPU count and the image/pagedir page counts.
 */
static void dump_info(void)
{
        pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
        pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
        pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
        pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
        pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
        pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
        pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
        pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
        pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
        pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
        pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages);
}
326
327 static void init_header(void)
328 {
329         memset(&swsusp_info,0,sizeof(swsusp_info));
330         swsusp_info.version_code = LINUX_VERSION_CODE;
331         swsusp_info.num_physpages = num_physpages;
332         memcpy(&swsusp_info.uts,&system_utsname,sizeof(system_utsname));
333
334         swsusp_info.suspend_pagedir = pagedir_nosave;
335         swsusp_info.cpus = num_online_cpus();
336         swsusp_info.image_pages = nr_copy_pages;
337         dump_info();
338 }
339
340 static int close_swap(void)
341 {
342         swp_entry_t entry;
343         int error;
344
345         error = write_page((unsigned long)&swsusp_info,&entry);
346         if (!error) { 
347                 printk( "S" );
348                 error = mark_swapfiles(entry);
349                 printk( "|\n" );
350         }
351         return error;
352 }
353
354 /**
355  *      free_pagedir_entries - Free pages used by the page directory.
356  *
357  *      This is used during suspend for error recovery.
358  */
359
360 static void free_pagedir_entries(void)
361 {
362         int i;
363
364         for (i = 0; i < swsusp_info.pagedir_pages; i++)
365                 swap_free(swsusp_info.pagedir[i]);
366 }
367
368
369 /**
370  *      write_pagedir - Write the array of pages holding the page directory.
371  *      @last:  Last swap entry we write (needed for header).
372  */
373
374 static int write_pagedir(void)
375 {
376         unsigned long addr = (unsigned long)pagedir_nosave;
377         int error = 0;
378         int n = SUSPEND_PD_PAGES(nr_copy_pages);
379         int i;
380
381         swsusp_info.pagedir_pages = n;
382         printk( "Writing pagedir (%d pages)\n", n);
383         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE)
384                 error = write_page(addr, &swsusp_info.pagedir[i]);
385         return error;
386 }
387
/**
 *      write_suspend_image - Write entire image and metadata.
 *
 *      Order: header initialisation, image data, page directory, then
 *      the header page and swap signature (close_swap()).  If the data
 *      write fails only the data entries are released; a later failure
 *      releases the page directory entries as well.
 *
 *      Returns 0 on success or the error of the step that failed.
 */

static int write_suspend_image(void)
{
        int error;

        init_header();

        error = data_write();
        if (error)
                goto free_data;

        error = write_pagedir();
        if (!error)
                error = close_swap();
        if (!error)
                return 0;

        free_pagedir_entries();
 free_data:
        data_free();
        return error;
}
414
415
416 #ifdef CONFIG_HIGHMEM
/*
 * One saved highmem page.  Nodes are pushed onto the highmem_copy list by
 * save_highmem_zone() and consumed by restore_highmem().
 */
struct highmem_page {
        char *data;                     /* page-sized buffer holding the copy */
        struct page *page;              /* the highmem page that was copied */
        struct highmem_page *next;      /* next node in the list */
};

/* Head of the list of saved highmem pages; most recently saved first */
static struct highmem_page *highmem_copy;
424
425 static int save_highmem_zone(struct zone *zone)
426 {
427         unsigned long zone_pfn;
428         mark_free_pages(zone);
429         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
430                 struct page *page;
431                 struct highmem_page *save;
432                 void *kaddr;
433                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
434
435                 if (!(pfn%1000))
436                         printk(".");
437                 if (!pfn_valid(pfn))
438                         continue;
439                 page = pfn_to_page(pfn);
440                 /*
441                  * This condition results from rvmalloc() sans vmalloc_32()
442                  * and architectural memory reservations. This should be
443                  * corrected eventually when the cases giving rise to this
444                  * are better understood.
445                  */
446                 if (PageReserved(page)) {
447                         printk("highmem reserved page?!\n");
448                         continue;
449                 }
450                 BUG_ON(PageNosave(page));
451                 if (PageNosaveFree(page))
452                         continue;
453                 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
454                 if (!save)
455                         return -ENOMEM;
456                 save->next = highmem_copy;
457                 save->page = page;
458                 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
459                 if (!save->data) {
460                         kfree(save);
461                         return -ENOMEM;
462                 }
463                 kaddr = kmap_atomic(page, KM_USER0);
464                 memcpy(save->data, kaddr, PAGE_SIZE);
465                 kunmap_atomic(kaddr, KM_USER0);
466                 highmem_copy = save;
467         }
468         return 0;
469 }
470 #endif /* CONFIG_HIGHMEM */
471
472
/*
 * save_highmem - Save the contents of all highmem zones.
 *
 * Calls save_highmem_zone() on each zone for which is_highmem() is true
 * and stops at the first failure.  Without CONFIG_HIGHMEM this does
 * nothing.
 *
 * Returns 0 on success or the error from save_highmem_zone().
 */
static int save_highmem(void)
{
#ifdef CONFIG_HIGHMEM
        struct zone *zone;

        pr_debug("swsusp: Saving Highmem\n");
        for_each_zone(zone) {
                int err;

                if (!is_highmem(zone))
                        continue;
                err = save_highmem_zone(zone);
                if (err)
                        return err;
        }
#endif
        return 0;
}
489
/*
 * restore_highmem - Copy saved highmem pages back and free the copies.
 *
 * Each node on the highmem_copy list is unlinked, its data is copied
 * back into the page it came from through a temporary atomic mapping,
 * and both the data page and the node are freed.  The list is empty
 * afterwards.  Without CONFIG_HIGHMEM this does nothing.
 *
 * Always returns 0.
 */
static int restore_highmem(void)
{
#ifdef CONFIG_HIGHMEM
        struct highmem_page *cur, *next;

        printk("swsusp: Restoring Highmem\n");
        for (cur = highmem_copy; cur; cur = next) {
                void *dst;

                next = cur->next;
                highmem_copy = next;

                dst = kmap_atomic(cur->page, KM_USER0);
                memcpy(dst, cur->data, PAGE_SIZE);
                kunmap_atomic(dst, KM_USER0);
                free_page((long) cur->data);
                kfree(cur);
        }
#endif
        return 0;
}
508
509
510 static int pfn_is_nosave(unsigned long pfn)
511 {
512         unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
513         unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
514         return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
515 }
516
517 /**
518  *      saveable - Determine whether a page should be cloned or not.
519  *      @pfn:   The page
520  *
521  *      We save a page if it's Reserved, and not in the range of pages
522  *      statically defined as 'unsaveable', or if it isn't reserved, and
523  *      isn't part of a free chunk of pages.
524  */
525
526 static int saveable(struct zone * zone, unsigned long * zone_pfn)
527 {
528         unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
529         struct page * page;
530
531         if (!pfn_valid(pfn))
532                 return 0;
533
534         page = pfn_to_page(pfn);
535         BUG_ON(PageReserved(page) && PageNosave(page));
536         if (PageNosave(page))
537                 return 0;
538         if (PageReserved(page) && pfn_is_nosave(pfn)) {
539                 pr_debug("[nosave pfn 0x%lx]", pfn);
540                 return 0;
541         }
542         if (PageNosaveFree(page))
543                 return 0;
544
545         return 1;
546 }
547
548 static void count_data_pages(void)
549 {
550         struct zone *zone;
551         unsigned long zone_pfn;
552
553         nr_copy_pages = 0;
554
555         for_each_zone(zone) {
556                 if (is_highmem(zone))
557                         continue;
558                 mark_free_pages(zone);
559                 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
560                         nr_copy_pages += saveable(zone, &zone_pfn);
561         }
562 }
563
564
565 static void copy_data_pages(void)
566 {
567         struct zone *zone;
568         unsigned long zone_pfn;
569         struct pbe * pbe = pagedir_nosave;
570         int to_copy = nr_copy_pages;
571         
572         for_each_zone(zone) {
573                 if (is_highmem(zone))
574                         continue;
575                 mark_free_pages(zone);
576                 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
577                         if (saveable(zone, &zone_pfn)) {
578                                 struct page * page;
579                                 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
580                                 pbe->orig_address = (long) page_address(page);
581                                 /* copy_page is not usable for copying task structs. */
582                                 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
583                                 pbe++;
584                                 to_copy--;
585                         }
586                 }
587         }
588         BUG_ON(to_copy);
589 }
590
591
592 /**
593  *      calc_order - Determine the order of allocation needed for pagedir_save.
594  *
595  *      This looks tricky, but is just subtle. Please fix it some time.
596  *      Since there are %nr_copy_pages worth of pages in the snapshot, we need
597  *      to allocate enough contiguous space to hold 
598  *              (%nr_copy_pages * sizeof(struct pbe)), 
599  *      which has the saved/orig locations of the page.. 
600  *
601  *      SUSPEND_PD_PAGES() tells us how many pages we need to hold those 
602  *      structures, then we call get_bitmask_order(), which will tell us the
603  *      last bit set in the number, starting with 1. (If we need 30 pages, that
604  *      is 0x0000001e in hex. The last bit is the 5th, which is the order we 
605  *      would use to allocate 32 contiguous pages).
606  *
607  *      Since we also need to save those pages, we add the number of pages that
608  *      we need to nr_copy_pages, and in case of an overflow, do the 
609  *      calculation again to update the number of pages needed. 
610  *
611  *      With this model, we will tend to waste a lot of memory if we just cross
612  *      an order boundary. Plus, the higher the order of allocation that we try
613  *      to do, the more likely we are to fail in a low-memory situtation 
614  *      (though we're unlikely to get this far in such a case, since swsusp 
615  *      requires half of memory to be free anyway).
616  */
617
618
619 static void calc_order(void)
620 {
621         int diff = 0;
622         int order = 0;
623
624         do {
625                 diff = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages)) - order;
626                 if (diff) {
627                         order += diff;
628                         nr_copy_pages += 1 << diff;
629                 }
630         } while(diff);
631         pagedir_order = order;
632 }
633
634
635 /**
636  *      alloc_pagedir - Allocate the page directory.
637  *
638  *      First, determine exactly how many contiguous pages we need and
639  *      allocate them.
640  */
641
642 static int alloc_pagedir(void)
643 {
644         calc_order();
645         pagedir_save = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD,
646                                                              pagedir_order);
647         if (!pagedir_save)
648                 return -ENOMEM;
649         memset(pagedir_save, 0, (1 << pagedir_order) * PAGE_SIZE);
650         pagedir_nosave = pagedir_save;
651         return 0;
652 }
653
654 /**
655  *      free_image_pages - Free pages allocated for snapshot
656  */
657
658 static void free_image_pages(void)
659 {
660         struct pbe * p;
661         int i;
662
663         p = pagedir_save;
664         for (i = 0, p = pagedir_save; i < nr_copy_pages; i++, p++) {
665                 if (p->address) {
666                         ClearPageNosave(virt_to_page(p->address));
667                         free_page(p->address);
668                         p->address = 0;
669                 }
670         }
671 }
672
673 /**
674  *      alloc_image_pages - Allocate pages for the snapshot.
675  *
676  */
677
678 static int alloc_image_pages(void)
679 {
680         struct pbe * p;
681         int i;
682
683         for (i = 0, p = pagedir_save; i < nr_copy_pages; i++, p++) {
684                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
685                 if (!p->address)
686                         return -ENOMEM;
687                 SetPageNosave(virt_to_page(p->address));
688         }
689         return 0;
690 }
691
692 void swsusp_free(void)
693 {
694         BUG_ON(PageNosave(virt_to_page(pagedir_save)));
695         BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
696         free_image_pages();
697         free_pages((unsigned long) pagedir_save, pagedir_order);
698 }
699
700
701 /**
702  *      enough_free_mem - Make sure we enough free memory to snapshot.
703  *
704  *      Returns TRUE or FALSE after checking the number of available 
705  *      free pages.
706  */
707
708 static int enough_free_mem(void)
709 {
710         if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
711                 pr_debug("swsusp: Not enough free pages: Have %d\n",
712                          nr_free_pages());
713                 return 0;
714         }
715         return 1;
716 }
717
718
719 /**
720  *      enough_swap - Make sure we have enough swap to save the image.
721  *
722  *      Returns TRUE or FALSE after checking the total amount of swap 
723  *      space avaiable.
724  *
725  *      FIXME: si_swapinfo(&i) returns all swap devices information.
726  *      We should only consider resume_device. 
727  */
728
729 static int enough_swap(void)
730 {
731         struct sysinfo i;
732
733         si_swapinfo(&i);
734         if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO))  {
735                 pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
736                 return 0;
737         }
738         return 1;
739 }
740
741 static int swsusp_alloc(void)
742 {
743         int error;
744
745         pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
746                  nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
747
748         pagedir_nosave = NULL;
749         if (!enough_free_mem())
750                 return -ENOMEM;
751
752         if (!enough_swap())
753                 return -ENOSPC;
754
755         if ((error = alloc_pagedir())) {
756                 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
757                 return error;
758         }
759         if ((error = alloc_image_pages())) {
760                 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
761                 swsusp_free();
762                 return error;
763         }
764
765         nr_copy_pages_check = nr_copy_pages;
766         pagedir_order_check = pagedir_order;
767         return 0;
768 }
769
770 static int suspend_prepare_image(void)
771 {
772         int error;
773
774         pr_debug("swsusp: critical section: \n");
775         if (save_highmem()) {
776                 printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
777                 restore_highmem();
778                 return -ENOMEM;
779         }
780
781         drain_local_pages();
782         count_data_pages();
783         printk("swsusp: Need to copy %u pages\n",nr_copy_pages);
784
785         error = swsusp_alloc();
786         if (error)
787                 return error;
788         
789         /* During allocating of suspend pagedir, new cold pages may appear. 
790          * Kill them.
791          */
792         drain_local_pages();
793         copy_data_pages();
794
795         /*
796          * End of critical section. From now on, we can write to memory,
797          * but we should not touch disk. This specially means we must _not_
798          * touch swap space! Except we must write out our image of course.
799          */
800
801         printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
802         return 0;
803 }
804
805
/*
 * swsusp_write - Write the prepared image to swap.
 *
 * Devices are resumed first; the ignored swap devices are then toggled
 * out of service around write_suspend_image().  Returns the result of
 * write_suspend_image().
 *
 * It is important _NOT_ to umount filesystems at this point. We want
 * them synced (in case something goes wrong) but we DO not want to mark
 * filesystem clean: it is not. (And it does not matter, if we resume
 * correctly, we'll mark system clean, anyway.)
 */
int swsusp_write(void)
{
        int error;
        device_resume();
        /* First call flips the flags of the ignored swap devices */
        lock_swapdevices();
        error = write_suspend_image();
        /* This will unlock ignored swap devices since writing is finished */
        lock_swapdevices();
        return error;

}
822
823
824 extern asmlinkage int swsusp_arch_suspend(void);
825 extern asmlinkage int swsusp_arch_resume(void);
826
827
828 asmlinkage int swsusp_save(void)
829 {
830         int error = 0;
831
832         if ((error = swsusp_swap_check())) {
833                 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
834                                 "swapon -a!\n");
835                 return error;
836         }
837         return suspend_prepare_image();
838 }
839
840 int swsusp_suspend(void)
841 {
842         int error;
843         if ((error = arch_prepare_suspend()))
844                 return error;
845         local_irq_disable();
846         /* At this point, device_suspend() has been called, but *not*
847          * device_power_down(). We *must* device_power_down() now.
848          * Otherwise, drivers for some devices (e.g. interrupt controllers)
849          * become desynchronized with the actual state of the hardware
850          * at resume time, and evil weirdness ensues.
851          */
852         if ((error = device_power_down(PMSG_FREEZE))) {
853                 local_irq_enable();
854                 return error;
855         }
856         save_processor_state();
857         error = swsusp_arch_suspend();
858         /* Restore control flow magically appears here */
859         restore_processor_state();
860         restore_highmem();
861         device_power_up();
862         local_irq_enable();
863         return error;
864 }
865
866
/*
 * swsusp_restore - Sanity-check the restored state and flush the TLB.
 *
 * nr_copy_pages and pagedir_order must equal the values remembered by
 * swsusp_alloc(); a mismatch is treated as a bug.  Always returns 0.
 */
asmlinkage int swsusp_restore(void)
{
        BUG_ON (nr_copy_pages_check != nr_copy_pages);
        BUG_ON (pagedir_order_check != pagedir_order);
        
        /* Even mappings of "global" things (vmalloc) need to be fixed */
        __flush_tlb_global();
        return 0;
}
876
877 int swsusp_resume(void)
878 {
879         int error;
880         local_irq_disable();
881         device_power_down(PMSG_FREEZE);
882         /* We'll ignore saved state, but this gets preempt count (etc) right */
883         save_processor_state();
884         error = swsusp_arch_resume();
885         /* Code below is only ever reached in case of failure. Otherwise
886          * execution continues at place where swsusp_arch_suspend was called
887          */
888         BUG_ON(!error);
889         restore_processor_state();
890         restore_highmem();
891         device_power_up();
892         local_irq_enable();
893         return error;
894 }
895
896 /* More restore stuff */
897
898 /*
899  * Returns true if given address/order collides with any orig_address 
900  */
901 static int __init does_collide_order(unsigned long addr, int order)
902 {
903         int i;
904         
905         for (i=0; i < (1<<order); i++)
906                 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
907                         return 1;
908         return 0;
909 }
910
911 /*
912  * We check here that pagedir & pages it points to won't collide with pages
913  * where we're going to restore from the loaded pages later
914  */
915 static int __init check_pagedir(void)
916 {
917         int i;
918
919         for(i=0; i < nr_copy_pages; i++) {
920                 unsigned long addr;
921
922                 do {
923                         addr = get_zeroed_page(GFP_ATOMIC);
924                         if(!addr)
925                                 return -ENOMEM;
926                 } while (does_collide_order(addr, 0));
927
928                 (pagedir_nosave+i)->address = addr;
929         }
930         return 0;
931 }
932
933 static int __init swsusp_pagedir_relocate(void)
934 {
935         /*
936          * We have to avoid recursion (not to overflow kernel stack),
937          * and that's why code looks pretty cryptic 
938          */
939         suspend_pagedir_t *old_pagedir = pagedir_nosave;
940         void **eaten_memory = NULL;
941         void **c = eaten_memory, *m, *f;
942         int ret = 0;
943         struct zone *zone;
944         int i;
945         struct pbe *p;
946         unsigned long zone_pfn;
947
948         printk("Relocating pagedir ");
949
950         /* Set page flags */
951
952         for_each_zone(zone) {
953                 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
954                         SetPageNosaveFree(pfn_to_page(zone_pfn +
955                                         zone->zone_start_pfn));
956         }
957
958         /* Clear orig address */
959
960         for(i = 0, p = pagedir_nosave; i < nr_copy_pages; i++, p++) {
961                 ClearPageNosaveFree(virt_to_page(p->orig_address));
962         }
963
964         if (!does_collide_order((unsigned long)old_pagedir, pagedir_order)) {
965                 printk("not necessary\n");
966                 return check_pagedir();
967         }
968
969         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) {
970                 if (!does_collide_order((unsigned long)m, pagedir_order))
971                         break;
972                 eaten_memory = m;
973                 printk( "." ); 
974                 *eaten_memory = c;
975                 c = eaten_memory;
976         }
977
978         if (!m) {
979                 printk("out of memory\n");
980                 ret = -ENOMEM;
981         } else {
982                 pagedir_nosave =
983                         memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order);
984         }
985
986         c = eaten_memory;
987         while (c) {
988                 printk(":");
989                 f = c;
990                 c = *c;
991                 free_pages((unsigned long)f, pagedir_order);
992         }
993         if (ret)
994                 return ret;
995         printk("|\n");
996         return check_pagedir();
997 }
998
999 /**
1000  *      Using bio to read from swap.
1001  *      This code requires a bit more work than just using buffer heads
1002  *      but, it is the recommended way for 2.5/2.6.
1003  *      The following are to signal the beginning and end of I/O. Bios
1004  *      finish asynchronously, while we want them to happen synchronously.
1005  *      A simple atomic_t, and a wait loop take care of this problem.
1006  */
1007
1008 static atomic_t io_done = ATOMIC_INIT(0);
1009
1010 static int end_io(struct bio * bio, unsigned int num, int err)
1011 {
1012         if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1013                 panic("I/O error reading memory image");
1014         atomic_set(&io_done, 0);
1015         return 0;
1016 }
1017
1018 static struct block_device * resume_bdev;
1019
1020 /**
1021  *      submit - submit BIO request.
1022  *      @rw:    READ or WRITE.
1023  *      @off    physical offset of page.
1024  *      @page:  page we're reading or writing.
1025  *
1026  *      Straight from the textbook - allocate and initialize the bio.
1027  *      If we're writing, make sure the page is marked as dirty.
1028  *      Then submit it and wait.
1029  */
1030
1031 static int submit(int rw, pgoff_t page_off, void * page)
1032 {
1033         int error = 0;
1034         struct bio * bio;
1035
1036         bio = bio_alloc(GFP_ATOMIC, 1);
1037         if (!bio)
1038                 return -ENOMEM;
1039         bio->bi_sector = page_off * (PAGE_SIZE >> 9);
1040         bio_get(bio);
1041         bio->bi_bdev = resume_bdev;
1042         bio->bi_end_io = end_io;
1043
1044         if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
1045                 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
1046                 error = -EFAULT;
1047                 goto Done;
1048         }
1049
1050         if (rw == WRITE)
1051                 bio_set_pages_dirty(bio);
1052
1053         atomic_set(&io_done, 1);
1054         submit_bio(rw | (1 << BIO_RW_SYNC), bio);
1055         while (atomic_read(&io_done))
1056                 yield();
1057
1058  Done:
1059         bio_put(bio);
1060         return error;
1061 }
1062
1063 static int bio_read_page(pgoff_t page_off, void * page)
1064 {
1065         return submit(READ, page_off, page);
1066 }
1067
1068 static int bio_write_page(pgoff_t page_off, void * page)
1069 {
1070         return submit(WRITE, page_off, page);
1071 }
1072
1073 /*
1074  * Sanity check if this image makes sense with this kernel/swap context
1075  * I really don't think that it's foolproof but more than nothing..
1076  */
1077
1078 static const char * __init sanity_check(void)
1079 {
1080         dump_info();
1081         if(swsusp_info.version_code != LINUX_VERSION_CODE)
1082                 return "kernel version";
1083         if(swsusp_info.num_physpages != num_physpages)
1084                 return "memory size";
1085         if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
1086                 return "system type";
1087         if (strcmp(swsusp_info.uts.release,system_utsname.release))
1088                 return "kernel release";
1089         if (strcmp(swsusp_info.uts.version,system_utsname.version))
1090                 return "version";
1091         if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1092                 return "machine";
1093         if(swsusp_info.cpus != num_online_cpus())
1094                 return "number of cpus";
1095         return NULL;
1096 }
1097
1098
1099 static int __init check_header(void)
1100 {
1101         const char * reason = NULL;
1102         int error;
1103
1104         if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
1105                 return error;
1106
1107         /* Is this same machine? */
1108         if ((reason = sanity_check())) {
1109                 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
1110                 return -EPERM;
1111         }
1112         nr_copy_pages = swsusp_info.image_pages;
1113         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
1114         return error;
1115 }
1116
1117 static int __init check_sig(void)
1118 {
1119         int error;
1120
1121         memset(&swsusp_header, 0, sizeof(swsusp_header));
1122         if ((error = bio_read_page(0, &swsusp_header)))
1123                 return error;
1124         if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1125                 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1126
1127                 /*
1128                  * Reset swap signature now.
1129                  */
1130                 error = bio_write_page(0, &swsusp_header);
1131         } else { 
1132                 pr_debug(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
1133                 return -EINVAL;
1134         }
1135         if (!error)
1136                 pr_debug("swsusp: Signature found, resuming\n");
1137         return error;
1138 }
1139
1140 /**
1141  *      swsusp_read_data - Read image pages from swap.
1142  *
1143  *      You do not need to check for overlaps, check_pagedir()
1144  *      already did that.
1145  */
1146
1147 static int __init data_read(void)
1148 {
1149         struct pbe * p;
1150         int error;
1151         int i;
1152         int mod = nr_copy_pages / 100;
1153
1154         if (!mod)
1155                 mod = 1;
1156
1157         if ((error = swsusp_pagedir_relocate()))
1158                 return error;
1159
1160         printk( "Reading image data (%d pages):     ", nr_copy_pages );
1161         for(i = 0, p = pagedir_nosave; i < nr_copy_pages && !error; i++, p++) {
1162                 if (!(i%mod))
1163                         printk( "\b\b\b\b%3d%%", i / mod );
1164                 error = bio_read_page(swp_offset(p->swap_address),
1165                                   (void *)p->address);
1166         }
1167         printk(" %d done.\n",i);
1168         return error;
1169
1170 }
1171
1172 extern dev_t __init name_to_dev_t(const char *line);
1173
1174 static int __init read_pagedir(void)
1175 {
1176         unsigned long addr;
1177         int i, n = swsusp_info.pagedir_pages;
1178         int error = 0;
1179
1180         addr = __get_free_pages(GFP_ATOMIC, pagedir_order);
1181         if (!addr)
1182                 return -ENOMEM;
1183         pagedir_nosave = (struct pbe *)addr;
1184
1185         pr_debug("swsusp: Reading pagedir (%d Pages)\n",n);
1186
1187         for (i = 0; i < n && !error; i++, addr += PAGE_SIZE) {
1188                 unsigned long offset = swp_offset(swsusp_info.pagedir[i]);
1189                 if (offset)
1190                         error = bio_read_page(offset, (void *)addr);
1191                 else
1192                         error = -EFAULT;
1193         }
1194         if (error)
1195                 free_pages((unsigned long)pagedir_nosave, pagedir_order);
1196         return error;
1197 }
1198
1199 static int __init read_suspend_image(void)
1200 {
1201         int error = 0;
1202
1203         if ((error = check_sig()))
1204                 return error;
1205         if ((error = check_header()))
1206                 return error;
1207         if ((error = read_pagedir()))
1208                 return error;
1209         if ((error = data_read()))
1210                 free_pages((unsigned long)pagedir_nosave, pagedir_order);
1211         return error;
1212 }
1213
1214 /**
1215  *      swsusp_read - Read saved image from swap.
1216  */
1217
1218 int __init swsusp_read(void)
1219 {
1220         int error;
1221
1222         if (!strlen(resume_file))
1223                 return -ENOENT;
1224
1225         resume_device = name_to_dev_t(resume_file);
1226         pr_debug("swsusp: Resume From Partition: %s\n", resume_file);
1227
1228         resume_bdev = open_by_devnum(resume_device, FMODE_READ);
1229         if (!IS_ERR(resume_bdev)) {
1230                 set_blocksize(resume_bdev, PAGE_SIZE);
1231                 error = read_suspend_image();
1232                 blkdev_put(resume_bdev);
1233         } else
1234                 error = PTR_ERR(resume_bdev);
1235
1236         if (!error)
1237                 pr_debug("Reading resume file was successful\n");
1238         else
1239                 pr_debug("swsusp: Error %d resuming\n", error);
1240         return error;
1241 }