ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.6.tar.bz2
[linux-2.6.git] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file is to realize architecture-independent
5  * machine suspend feature using pretty near only high-level routines
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/syscalls.h>
63 #include <linux/console.h>
64 #include <linux/highmem.h>
65
66 #include <asm/uaccess.h>
67 #include <asm/mmu_context.h>
68 #include <asm/pgtable.h>
69 #include <asm/io.h>
70
71 #include "power.h"
72
73 unsigned char software_suspend_enabled = 0;
74
75 #define NORESUME                1
76 #define RESUME_SPECIFIED        2
77
78 /* References to section boundaries */
79 extern char __nosave_begin, __nosave_end;
80
81 extern int is_head_of_free_region(struct page *);
82
83 /* Locks */
84 spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;
85
86 /* Variables to be preserved over suspend */
87 static int pagedir_order_check;
88 static int nr_copy_pages_check;
89
90 static int resume_status;
91 static char resume_file[256] = "";                      /* For resume= kernel option */
92 static dev_t resume_device;
93 /* Local variables that should not be affected by save */
94 unsigned int nr_copy_pages __nosavedata = 0;
95
96 /* Suspend pagedir is allocated before final copy, therefore it
97    must be freed after resume 
98
99    Warning: this is evil. There are actually two pagedirs at time of
100    resume. One is "pagedir_save", which is empty frame allocated at
101    time of suspend, that must be freed. Second is "pagedir_nosave", 
102    allocated at time of resume, that travels through memory not to
103    collide with anything.
104
105    Warning: this is even more evil than it seems. Pagedirs this file
106    talks about are completely different from page directories used by
107    MMU hardware.
108  */
109 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
110 static suspend_pagedir_t *pagedir_save;
111 static int pagedir_order __nosavedata = 0;
112
113 struct link {
114         char dummy[PAGE_SIZE - sizeof(swp_entry_t)];
115         swp_entry_t next;
116 };
117
118 union diskpage {
119         union swap_header swh;
120         struct link link;
121         struct suspend_header sh;
122 };
123
/*
 * XXX: We try to keep some more pages free so that I/O operations succeed
 * without paging. Should this be more?
 */
#define PAGES_FOR_IO	512

/* Prefixes put in front of messages printed by the suspend and resume paths */
static const char name_suspend[] = "Suspend Machine: ";
static const char name_resume[] = "Resume Machine: ";
132
/*
 * Debug switches
 */
#define DEBUG_DEFAULT
#undef  DEBUG_PROCESS
#undef  DEBUG_SLOW
#define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */

/* PRINTK(): forwards to printk() when DEBUG_DEFAULT is defined, else expands to nothing */
#ifndef DEBUG_DEFAULT
# define PRINTK(f, a...)	do { } while(0)
#else
# define PRINTK(f, a...)	printk(f, ## a)
#endif

/* MDELAY(): forwards to mdelay() when DEBUG_SLOW is defined, else expands to nothing */
#ifndef DEBUG_SLOW
#define MDELAY(a) do { } while(0)
#else
#define MDELAY(a) mdelay(a)
#endif
152
153 /*
154  * Saving part...
155  */
156
157 static __inline__ int fill_suspend_header(struct suspend_header *sh)
158 {
159         memset((char *)sh, 0, sizeof(*sh));
160
161         sh->version_code = LINUX_VERSION_CODE;
162         sh->num_physpages = num_physpages;
163         strncpy(sh->machine, system_utsname.machine, 8);
164         strncpy(sh->version, system_utsname.version, 20);
165         /* FIXME: Is this bogus? --RR */
166         sh->num_cpus = num_online_cpus();
167         sh->page_size = PAGE_SIZE;
168         sh->suspend_pagedir = pagedir_nosave;
169         BUG_ON (pagedir_save != pagedir_nosave);
170         sh->num_pbes = nr_copy_pages;
171         /* TODO: needed? mounted fs' last mounted date comparison
172          * [so they haven't been mounted since last suspend.
173          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
174          */
175         return 0;
176 }
177
178 /* We memorize in swapfile_used what swap devices are used for suspension */
179 #define SWAPFILE_UNUSED    0
180 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
181 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
182
183 static unsigned short swapfile_used[MAX_SWAPFILES];
184 static unsigned short root_swap;
185 #define MARK_SWAP_SUSPEND 0
186 #define MARK_SWAP_RESUME 2
187
188 static void mark_swapfiles(swp_entry_t prev, int mode)
189 {
190         swp_entry_t entry;
191         union diskpage *cur;
192         struct page *page;
193
194         if (root_swap == 0xFFFF)  /* ignored */
195                 return;
196
197         page = alloc_page(GFP_ATOMIC);
198         if (!page)
199                 panic("Out of memory in mark_swapfiles");
200         cur = page_address(page);
201         /* XXX: this is dirty hack to get first page of swap file */
202         entry = swp_entry(root_swap, 0);
203         rw_swap_page_sync(READ, entry, page);
204
205         if (mode == MARK_SWAP_RESUME) {
206                 if (!memcmp("S1",cur->swh.magic.magic,2))
207                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
208                 else if (!memcmp("S2",cur->swh.magic.magic,2))
209                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
210                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
211                         name_resume, cur->swh.magic.magic);
212         } else {
213                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
214                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
215                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
216                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
217                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
218                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
219                 /* link.next lies *no more* in last 4/8 bytes of magic */
220         }
221         rw_swap_page_sync(WRITE, entry, page);
222         __free_page(page);
223 }
224
225 static void read_swapfiles(void) /* This is called before saving image */
226 {
227         int i, len;
228         static char buff[sizeof(resume_file)], *sname;
229         
230         len=strlen(resume_file);
231         root_swap = 0xFFFF;
232         
233         swap_list_lock();
234         for(i=0; i<MAX_SWAPFILES; i++) {
235                 if (swap_info[i].flags == 0) {
236                         swapfile_used[i]=SWAPFILE_UNUSED;
237                 } else {
238                         if(!len) {
239                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
240                                 if(root_swap == 0xFFFF) {
241                                         swapfile_used[i] = SWAPFILE_SUSPEND;
242                                         root_swap = i;
243                                 } else
244                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
245                         } else {
246                                 /* we ignore all swap devices that are not the resume_file */
247                                 sname = d_path(swap_info[i].swap_file->f_dentry,
248                                                swap_info[i].swap_file->f_vfsmnt,
249                                                buff,
250                                                sizeof(buff));
251                                 if (!strcmp(sname, resume_file)) {
252                                         swapfile_used[i] = SWAPFILE_SUSPEND;
253                                         root_swap = i;
254                                 } else {
255 #if 0
256                                         printk( "Resume: device %s (%x != %x) ignored\n", swap_info[i].swap_file->d_name.name, swap_info[i].swap_device, resume_device );                                 
257 #endif
258                                         swapfile_used[i] = SWAPFILE_IGNORED;
259                                 }
260                         }
261                 }
262         }
263         swap_list_unlock();
264 }
265
266 static void lock_swapdevices(void) /* This is called after saving image so modification
267                                       will be lost after resume... and that's what we want. */
268 {
269         int i;
270
271         swap_list_lock();
272         for(i = 0; i< MAX_SWAPFILES; i++)
273                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
274                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
275                                                        lock_swapdevices can unlock the devices. */
276                 }
277         swap_list_unlock();
278 }
279
280 /**
281  *    write_suspend_image - Write entire image to disk.
282  *
283  *    After writing suspend signature to the disk, suspend may no
284  *    longer fail: we have ready-to-run image in swap, and rollback
285  *    would happen on next reboot -- corrupting data.
286  *
287  *    Note: The buffer we allocate to use to write the suspend header is
288  *    not freed; its not needed since the system is going down anyway
289  *    (plus it causes an oops and I'm lazy^H^H^H^Htoo busy).
290  */
291 static int write_suspend_image(void)
292 {
293         int i;
294         swp_entry_t entry, prev = { 0 };
295         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
296         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
297         unsigned long address;
298         struct page *page;
299
300         if (!buffer)
301                 return -ENOMEM;
302
303         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
304         for (i=0; i<nr_copy_pages; i++) {
305                 if (!(i%100))
306                         printk( "." );
307                 if (!(entry = get_swap_page()).val)
308                         panic("\nNot enough swapspace when writing data" );
309                 
310                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
311                         panic("\nPage %d: not enough swapspace on suspend device", i );
312             
313                 address = (pagedir_nosave+i)->address;
314                 page = virt_to_page(address);
315                 rw_swap_page_sync(WRITE, entry, page);
316                 (pagedir_nosave+i)->swap_address = entry;
317         }
318         printk( "|\n" );
319         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
320         for (i=0; i<nr_pgdir_pages; i++) {
321                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
322                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
323                 printk( "." );
324                 if (!(entry = get_swap_page()).val) {
325                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
326                         panic("Don't know how to recover");
327                         free_page((unsigned long) buffer);
328                         return -ENOSPC;
329                 }
330
331                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
332                         panic("\nNot enough swapspace for pagedir on suspend device" );
333
334                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
335                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
336
337                 cur->link.next = prev;                          
338                 page = virt_to_page((unsigned long)cur);
339                 rw_swap_page_sync(WRITE, entry, page);
340                 prev = entry;
341         }
342         printk("H");
343         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
344         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
345         BUG_ON (sizeof(struct link) != PAGE_SIZE);
346         if (!(entry = get_swap_page()).val)
347                 panic( "\nNot enough swapspace when writing header" );
348         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
349                 panic("\nNot enough swapspace for header on suspend device" );
350
351         cur = (void *) buffer;
352         if (fill_suspend_header(&cur->sh))
353                 BUG();          /* Not a BUG_ON(): we want fill_suspend_header to be called, always */
354                 
355         cur->link.next = prev;
356
357         page = virt_to_page((unsigned long)cur);
358         rw_swap_page_sync(WRITE, entry, page);
359         prev = entry;
360
361         printk( "S" );
362         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
363         printk( "|\n" );
364
365         MDELAY(1000);
366         return 0;
367 }
368
369 #ifdef CONFIG_HIGHMEM
370 struct highmem_page {
371         char *data;
372         struct page *page;
373         struct highmem_page *next;
374 };
375
376 struct highmem_page *highmem_copy = NULL;
377
378 static int save_highmem_zone(struct zone *zone)
379 {
380         unsigned long zone_pfn;
381         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
382                 struct page *page;
383                 struct highmem_page *save;
384                 void *kaddr;
385                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
386                 int chunk_size;
387
388                 if (!(pfn%1000))
389                         printk(".");
390                 if (!pfn_valid(pfn))
391                         continue;
392                 page = pfn_to_page(pfn);
393                 /*
394                  * This condition results from rvmalloc() sans vmalloc_32()
395                  * and architectural memory reservations. This should be
396                  * corrected eventually when the cases giving rise to this
397                  * are better understood.
398                  */
399                 if (PageReserved(page)) {
400                         printk("highmem reserved page?!\n");
401                         continue;
402                 }
403                 if ((chunk_size = is_head_of_free_region(page))) {
404                         pfn += chunk_size - 1;
405                         zone_pfn += chunk_size - 1;
406                         continue;
407                 }
408                 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
409                 if (!save)
410                         return -ENOMEM;
411                 save->next = highmem_copy;
412                 save->page = page;
413                 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
414                 if (!save->data) {
415                         kfree(save);
416                         return -ENOMEM;
417                 }
418                 kaddr = kmap_atomic(page, KM_USER0);
419                 memcpy(save->data, kaddr, PAGE_SIZE);
420                 kunmap_atomic(kaddr, KM_USER0);
421                 highmem_copy = save;
422         }
423         return 0;
424 }
425
426 static int save_highmem(void)
427 {
428         struct zone *zone;
429         int res = 0;
430         for_each_zone(zone) {
431                 if (is_highmem(zone))
432                         res = save_highmem_zone(zone);
433                 if (res)
434                         return res;
435         }
436         return 0;
437 }
438
439 static int restore_highmem(void)
440 {
441         while (highmem_copy) {
442                 struct highmem_page *save = highmem_copy;
443                 void *kaddr;
444                 highmem_copy = save->next;
445
446                 kaddr = kmap_atomic(save->page, KM_USER0);
447                 memcpy(kaddr, save->data, PAGE_SIZE);
448                 kunmap_atomic(kaddr, KM_USER0);
449                 free_page((long) save->data);
450                 kfree(save);
451         }
452         return 0;
453 }
454 #endif
455
456 static int pfn_is_nosave(unsigned long pfn)
457 {
458         unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
459         unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
460         return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
461 }
462
463 /* if *pagedir_p != NULL it also copies the counted pages */
464 static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
465 {
466         unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
467         struct pbe *pbe = *pagedir_p;
468         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
469                 struct page *page;
470                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
471
472                 if (!(pfn%1000))
473                         printk(".");
474                 if (!pfn_valid(pfn))
475                         continue;
476                 page = pfn_to_page(pfn);
477                 BUG_ON(PageReserved(page) && PageNosave(page));
478                 if (PageNosave(page))
479                         continue;
480                 if (PageReserved(page) && pfn_is_nosave(pfn)) {
481                         PRINTK("[nosave pfn 0x%lx]", pfn);
482                         continue;
483                 }
484                 if ((chunk_size = is_head_of_free_region(page))) {
485                         pfn += chunk_size - 1;
486                         zone_pfn += chunk_size - 1;
487                         continue;
488                 }
489                 nr_copy_pages++;
490                 if (!pbe)
491                         continue;
492                 pbe->orig_address = (long) page_address(page);
493                 copy_page((void *)pbe->address, (void *)pbe->orig_address);
494                 pbe++;
495         }
496         *pagedir_p = pbe;
497         return nr_copy_pages;
498 }
499
500 static int count_and_copy_data_pages(struct pbe *pagedir_p)
501 {
502         int nr_copy_pages = 0;
503         struct zone *zone;
504         for_each_zone(zone) {
505                 if (!is_highmem(zone))
506                         nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
507         }
508         return nr_copy_pages;
509 }
510
511 static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
512 {
513         unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
514         pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
515         pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
516         pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
517         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
518                 struct page *page;
519                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
520                 if (!pfn_valid(pfn))
521                         continue;
522                 page = pfn_to_page(pfn);
523                 if (!TestClearPageNosave(page))
524                         continue;
525                 else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
526                         continue;
527                 __free_page(page);
528         }
529 }
530
531 static void free_suspend_pagedir(unsigned long this_pagedir)
532 {
533         struct zone *zone;
534         for_each_zone(zone) {
535                 if (!is_highmem(zone))
536                         free_suspend_pagedir_zone(zone, this_pagedir);
537         }
538         free_pages(this_pagedir, pagedir_order);
539 }
540
541 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
542 {
543         int i;
544         suspend_pagedir_t *pagedir;
545         struct pbe *p;
546         struct page *page;
547
548         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
549
550         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
551         if (!pagedir)
552                 return NULL;
553
554         page = virt_to_page(pagedir);
555         for(i=0; i < 1<<pagedir_order; i++)
556                 SetPageNosave(page++);
557                 
558         while(nr_copy_pages--) {
559                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
560                 if (!p->address) {
561                         free_suspend_pagedir((unsigned long) pagedir);
562                         return NULL;
563                 }
564                 SetPageNosave(virt_to_page(p->address));
565                 p->orig_address = 0;
566                 p++;
567         }
568         return pagedir;
569 }
570
571 static int prepare_suspend_processes(void)
572 {
573         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
574         if (freeze_processes()) {
575                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
576                 thaw_processes();
577                 return 1;
578         }
579         return 0;
580 }
581
/*
 * Try to free as much memory as possible, but do not OOM-kill anyone.
 * shrink_all_memory(10000) is called repeatedly until it returns zero.
 *
 * Notice: all userland should be stopped at this point, or livelock is possible.
 */
static void free_some_memory(void)
{
	printk("Freeing memory: ");
	for (;;) {
		if (!shrink_all_memory(10000))
			break;
		printk(".");
	}
	printk("|\n");
}
594
595 static int suspend_prepare_image(void)
596 {
597         struct sysinfo i;
598         unsigned int nr_needed_pages = 0;
599
600         pagedir_nosave = NULL;
601         printk( "/critical section: ");
602 #ifdef CONFIG_HIGHMEM
603         printk( "handling highmem" );
604         if (save_highmem()) {
605                 printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
606                 return -ENOMEM;
607         }
608         printk(", ");
609 #endif
610
611         printk("counting pages to copy" );
612         drain_local_pages();
613         nr_copy_pages = count_and_copy_data_pages(NULL);
614         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
615         
616         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
617         if(nr_free_pages() < nr_needed_pages) {
618                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
619                        name_suspend, nr_needed_pages-nr_free_pages());
620                 root_swap = 0xFFFF;
621                 return -ENOMEM;
622         }
623         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
624                                    We should only consider resume_device. */
625         if (i.freeswap < nr_needed_pages)  {
626                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
627                        name_suspend, nr_needed_pages-i.freeswap);
628                 return -ENOSPC;
629         }
630
631         PRINTK( "Alloc pagedir\n" ); 
632         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
633         if (!pagedir_nosave) {
634                 /* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
635                 printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
636                 return -ENOMEM;
637         }
638         nr_copy_pages_check = nr_copy_pages;
639         pagedir_order_check = pagedir_order;
640
641         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
642         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
643                 BUG();
644
645         /*
646          * End of critical section. From now on, we can write to memory,
647          * but we should not touch disk. This specially means we must _not_
648          * touch swap space! Except we must write out our image of course.
649          */
650
651         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
652         return 0;
653 }
654
655 static void suspend_save_image(void)
656 {
657         device_resume();
658
659         lock_swapdevices();
660         write_suspend_image();
661         lock_swapdevices();     /* This will unlock ignored swap devices since writing is finished */
662
663         /* It is important _NOT_ to umount filesystems at this point. We want
664          * them synced (in case something goes wrong) but we DO not want to mark
665          * filesystem clean: it is not. (And it does not matter, if we resume
666          * correctly, we'll mark system clean, anyway.)
667          */
668 }
669
670 static void suspend_power_down(void)
671 {
672         extern int C_A_D;
673         C_A_D = 0;
674         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
675 #ifdef CONFIG_VT
676         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
677         mdelay(1000);
678         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
679                 machine_restart(NULL);
680         else
681 #endif
682         {
683                 device_shutdown();
684                 machine_power_off();
685         }
686
687         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
688         machine_halt();
689         while (1);
690         /* NOTREACHED */
691 }
692
693 /*
694  * Magic happens here
695  */
696
697 asmlinkage void do_magic_resume_1(void)
698 {
699         barrier();
700         mb();
701         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
702
703         device_power_down(4);
704         PRINTK( "Waiting for DMAs to settle down...\n");
705         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
706                            Do it with disabled interrupts for best effect. That way, if some
707                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
708 }
709
710 asmlinkage void do_magic_resume_2(void)
711 {
712         BUG_ON (nr_copy_pages_check != nr_copy_pages);
713         BUG_ON (pagedir_order_check != pagedir_order);
714
715         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
716
717         PRINTK( "Freeing prev allocated pagedir\n" );
718         free_suspend_pagedir((unsigned long) pagedir_save);
719
720 #ifdef CONFIG_HIGHMEM
721         printk( "Restoring highmem\n" );
722         restore_highmem();
723 #endif
724         printk("done, devices\n");
725
726         device_power_up();
727         spin_unlock_irq(&suspend_pagedir_lock);
728         device_resume();
729
730         /* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
731         PRINTK( "Fixing swap signatures... " );
732         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
733         PRINTK( "ok\n" );
734
735 #ifdef SUSPEND_CONSOLE
736         acquire_console_sem();
737         update_screen(fg_console);
738         release_console_sem();
739 #endif
740 }
741
742 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
743
744         if (!resume) {
745                 do_magic_suspend_1();
746                 save_processor_state();
747                 SAVE_REGISTERS
748                 do_magic_suspend_2();
749                 return;
750         }
751         GO_TO_SWAPPER_PAGE_TABLES
752         do_magic_resume_1();
753         COPY_PAGES_BACK
754         RESTORE_REGISTERS
755         restore_processor_state();
756         do_magic_resume_2();
757
758  */
759
760 asmlinkage void do_magic_suspend_1(void)
761 {
762         mb();
763         barrier();
764         BUG_ON(in_atomic());
765         spin_lock_irq(&suspend_pagedir_lock);
766 }
767
768 asmlinkage void do_magic_suspend_2(void)
769 {
770         int is_problem;
771         read_swapfiles();
772         device_power_down(4);
773         is_problem = suspend_prepare_image();
774         device_power_up();
775         spin_unlock_irq(&suspend_pagedir_lock);
776         if (!is_problem) {
777                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
778                 BUG_ON(in_atomic());
779                 suspend_save_image();
780                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
781         }
782
783         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
784         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
785
786         barrier();
787         mb();
788         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
789         mdelay(1000);
790
791         free_pages((unsigned long) pagedir_nosave, pagedir_order);
792         spin_unlock_irq(&suspend_pagedir_lock);
793
794         device_resume();
795         PRINTK( "Fixing swap signatures... " );
796         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
797         PRINTK( "ok\n" );
798 }
799
800 /*
801  * This is main interface to the outside world. It needs to be
802  * called from process context.
803  */
804 int software_suspend(void)
805 {
806         int res;
807         if (!software_suspend_enabled)
808                 return -EAGAIN;
809
810         software_suspend_enabled = 0;
811         might_sleep();
812
813         if (arch_prepare_suspend()) {
814                 printk("%sArchitecture failed to prepare\n", name_suspend);
815                 return -EPERM;
816         }               
817         if (pm_prepare_console())
818                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
819         if (!prepare_suspend_processes()) {
820
821                 /* At this point, all user processes and "dangerous"
822                    kernel threads are stopped. Free some memory, as we
823                    need half of memory free. */
824
825                 free_some_memory();
826                 
827                 /* Save state of all device drivers, and stop them. */             
828                 if ((res = device_suspend(4))==0)
829                         /* If stopping device drivers worked, we proceed basically into
830                          * suspend_save_image.
831                          *
832                          * do_magic(0) returns after system is resumed.
833                          *
834                          * do_magic() copies all "used" memory to "free" memory, then
835                          * unsuspends all device drivers, and writes memory to disk
836                          * using normal kernel mechanism.
837                          */
838                         do_magic(0);
839                 thaw_processes();
840         } else
841                 res = -EBUSY;
842         software_suspend_enabled = 1;
843         MDELAY(1000);
844         pm_restore_console();
845         return res;
846 }
847
848 /* More restore stuff */
849
850 /* FIXME: Why not memcpy(to, from, 1<<pagedir_order*PAGE_SIZE)? */
851 static void copy_pagedir(suspend_pagedir_t *to, suspend_pagedir_t *from)
852 {
853         int i;
854         char *topointer=(char *)to, *frompointer=(char *)from;
855
856         for(i=0; i < 1 << pagedir_order; i++) {
857                 copy_page(topointer, frompointer);
858                 topointer += PAGE_SIZE;
859                 frompointer += PAGE_SIZE;
860         }
861 }
862
863 #define does_collide(addr) does_collide_order(pagedir_nosave, addr, 0)
864
865 /*
866  * Returns true if given address/order collides with any orig_address 
867  */
868 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
869                 int order)
870 {
871         int i;
872         unsigned long addre = addr + (PAGE_SIZE<<order);
873         
874         for(i=0; i < nr_copy_pages; i++)
875                 if((pagedir+i)->orig_address >= addr &&
876                         (pagedir+i)->orig_address < addre)
877                         return 1;
878
879         return 0;
880 }
881
882 /*
883  * We check here that pagedir & pages it points to won't collide with pages
884  * where we're going to restore from the loaded pages later
885  */
886 static int check_pagedir(void)
887 {
888         int i;
889
890         for(i=0; i < nr_copy_pages; i++) {
891                 unsigned long addr;
892
893                 do {
894                         addr = get_zeroed_page(GFP_ATOMIC);
895                         if(!addr)
896                                 return -ENOMEM;
897                 } while (does_collide(addr));
898
899                 (pagedir_nosave+i)->address = addr;
900         }
901         return 0;
902 }
903
904 static int relocate_pagedir(void)
905 {
906         /*
907          * We have to avoid recursion (not to overflow kernel stack),
908          * and that's why code looks pretty cryptic 
909          */
910         suspend_pagedir_t *new_pagedir, *old_pagedir = pagedir_nosave;
911         void **eaten_memory = NULL;
912         void **c = eaten_memory, *m, *f;
913
914         printk("Relocating pagedir");
915
916         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
917                 printk("not necessary\n");
918                 return 0;
919         }
920
921         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
922                 memset(m, 0, PAGE_SIZE);
923                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
924                         break;
925                 eaten_memory = m;
926                 printk( "." ); 
927                 *eaten_memory = c;
928                 c = eaten_memory;
929         }
930
931         if (!m)
932                 return -ENOMEM;
933
934         pagedir_nosave = new_pagedir = m;
935         copy_pagedir(new_pagedir, old_pagedir);
936
937         c = eaten_memory;
938         while(c) {
939                 printk(":");
940                 f = *c;
941                 c = *c;
942                 if (f)
943                         free_pages((unsigned long)f, pagedir_order);
944         }
945         printk("|\n");
946         return 0;
947 }
948
949 /*
950  * Sanity check if this image makes sense with this kernel/swap context
951  * I really don't think that it's foolproof but more than nothing..
952  */
953
954 static int sanity_check_failed(char *reason)
955 {
956         printk(KERN_ERR "%s%s\n", name_resume, reason);
957         return -EPERM;
958 }
959
960 static int sanity_check(struct suspend_header *sh)
961 {
962         if (sh->version_code != LINUX_VERSION_CODE)
963                 return sanity_check_failed("Incorrect kernel version");
964         if (sh->num_physpages != num_physpages)
965                 return sanity_check_failed("Incorrect memory size");
966         if (strncmp(sh->machine, system_utsname.machine, 8))
967                 return sanity_check_failed("Incorrect machine type");
968         if (strncmp(sh->version, system_utsname.version, 20))
969                 return sanity_check_failed("Incorrect version");
970         if (sh->num_cpus != num_online_cpus())
971                 return sanity_check_failed("Incorrect number of cpus");
972         if (sh->page_size != PAGE_SIZE)
973                 return sanity_check_failed("Incorrect PAGE_SIZE");
974         return 0;
975 }
976
977 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
978 {
979         struct buffer_head *bh;
980         BUG_ON (pos%PAGE_SIZE);
981         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
982         if (!bh || (!bh->b_data)) {
983                 return -1;
984         }
985         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
986         BUG_ON(!buffer_uptodate(bh));
987         brelse(bh);
988         return 0;
989
990
991 static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
992 {
993 #if 0
994         struct buffer_head *bh;
995         BUG_ON (pos%PAGE_SIZE);
996         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
997         if (!bh || (!bh->b_data)) {
998                 return -1;
999         }
1000         memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
1001         BUG_ON(!buffer_uptodate(bh));
1002         generic_make_request(WRITE, bh);
1003         if (!buffer_uptodate(bh))
1004                 printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
1005         wait_on_buffer(bh);
1006         brelse(bh);
1007         return 0;
1008 #endif
1009         printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
1010         return 0;
1011 }
1012
1013 extern dev_t __init name_to_dev_t(const char *line);
1014
1015 static int __init __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
1016 {
1017         swp_entry_t next;
1018         int i, nr_pgdir_pages;
1019
1020 #define PREPARENEXT \
1021         {       next = cur->link.next; \
1022                 next.val = swp_offset(next) * PAGE_SIZE; \
1023         }
1024
1025         if (bdev_read_page(bdev, 0, cur)) return -EIO;
1026
1027         if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
1028             (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
1029                 printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
1030                 return -EINVAL;
1031         }
1032
1033         PREPARENEXT; /* We have to read next position before we overwrite it */
1034
1035         if (!memcmp("S1",cur->swh.magic.magic,2))
1036                 memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
1037         else if (!memcmp("S2",cur->swh.magic.magic,2))
1038                 memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
1039         else {
1040                 if (noresume)
1041                         return -EINVAL;
1042                 panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
1043                         name_resume, cur->swh.magic.magic);
1044         }
1045         if (noresume) {
1046                 /* We don't do a sanity check here: we want to restore the swap
1047                    whatever version of kernel made the suspend image;
1048                    We need to write swap, but swap is *not* enabled so
1049                    we must write the device directly */
1050                 printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
1051                 bdev_write_page(bdev, 0, cur);
1052         }
1053
1054         printk( "%sSignature found, resuming\n", name_resume );
1055         MDELAY(1000);
1056
1057         if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1058         if (sanity_check(&cur->sh))     /* Is this same machine? */     
1059                 return -EPERM;
1060         PREPARENEXT;
1061
1062         pagedir_save = cur->sh.suspend_pagedir;
1063         nr_copy_pages = cur->sh.num_pbes;
1064         nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
1065         pagedir_order = get_bitmask_order(nr_pgdir_pages);
1066
1067         pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
1068         if (!pagedir_nosave)
1069                 return -ENOMEM;
1070
1071         PRINTK( "%sReading pagedir, ", name_resume );
1072
1073         /* We get pages in reverse order of saving! */
1074         for (i=nr_pgdir_pages-1; i>=0; i--) {
1075                 BUG_ON (!next.val);
1076                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
1077                 if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1078                 PREPARENEXT;
1079         }
1080         BUG_ON (next.val);
1081
1082         if (relocate_pagedir())
1083                 return -ENOMEM;
1084         if (check_pagedir())
1085                 return -ENOMEM;
1086
1087         printk( "Reading image data (%d pages): ", nr_copy_pages );
1088         for(i=0; i < nr_copy_pages; i++) {
1089                 swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
1090                 if (!(i%100))
1091                         printk( "." );
1092                 /* You do not need to check for overlaps...
1093                    ... check_pagedir already did this work */
1094                 if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
1095                         return -EIO;
1096         }
1097         printk( "|\n" );
1098         return 0;
1099 }
1100
1101 static int read_suspend_image(const char * specialfile, int noresume)
1102 {
1103         union diskpage *cur;
1104         unsigned long scratch_page = 0;
1105         int error;
1106         char b[BDEVNAME_SIZE];
1107
1108         resume_device = name_to_dev_t(specialfile);
1109         scratch_page = get_zeroed_page(GFP_ATOMIC);
1110         cur = (void *) scratch_page;
1111         if (cur) {
1112                 struct block_device *bdev;
1113                 printk("Resuming from device %s\n",
1114                                 __bdevname(resume_device, b));
1115                 bdev = open_by_devnum(resume_device, FMODE_READ);
1116                 if (IS_ERR(bdev)) {
1117                         error = PTR_ERR(bdev);
1118                 } else {
1119                         set_blocksize(bdev, PAGE_SIZE);
1120                         error = __read_suspend_image(bdev, cur, noresume);
1121                         blkdev_put(bdev);
1122                 }
1123         } else error = -ENOMEM;
1124
1125         if (scratch_page)
1126                 free_page(scratch_page);
1127         switch (error) {
1128                 case 0:
1129                         PRINTK("Reading resume file was successful\n");
1130                         break;
1131                 case -EINVAL:
1132                         break;
1133                 case -EIO:
1134                         printk( "%sI/O error\n", name_resume);
1135                         break;
1136                 case -ENOENT:
1137                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1138                         break;
1139                 case -ENOMEM:
1140                         printk( "%sNot enough memory\n", name_resume);
1141                         break;
1142                 default:
1143                         printk( "%sError %d resuming\n", name_resume, error );
1144         }
1145         MDELAY(1000);
1146         return error;
1147 }
1148
1149 /**
1150  *      software_resume - Resume from a saved image.
1151  *
1152  *      Called as a late_initcall (so all devices are discovered and 
1153  *      initialized), we call swsusp to see if we have a saved image or not.
1154  *      If so, we quiesce devices, then restore the saved image. We will 
1155  *      return above (in pm_suspend_disk() ) if everything goes well. 
1156  *      Otherwise, we fail gracefully and return to the normally 
1157  *      scheduled program.
1158  *
1159  */
1160 static int __init software_resume(void)
1161 {
1162         if (num_online_cpus() > 1) {
1163                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1164                 return -EINVAL;
1165         }
1166         /* We enable the possibility of machine suspend */
1167         software_suspend_enabled = 1;
1168         if (!resume_status)
1169                 return 0;
1170
1171         printk( "%s", name_resume );
1172         if (resume_status == NORESUME) {
1173                 if(resume_file[0])
1174                         read_suspend_image(resume_file, 1);
1175                 printk( "disabled\n" );
1176                 return 0;
1177         }
1178         MDELAY(1000);
1179
1180         if (pm_prepare_console())
1181                 printk("swsusp: Can't allocate a console... proceeding\n");
1182
1183         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1184                 printk( "suspension device unspecified\n" );
1185                 return -EINVAL;
1186         }
1187
1188         printk( "resuming from %s\n", resume_file);
1189         if (read_suspend_image(resume_file, 0))
1190                 goto read_failure;
1191         device_suspend(4);
1192         do_magic(1);
1193         panic("This never returns");
1194
1195 read_failure:
1196         pm_restore_console();
1197         return 0;
1198 }
1199
1200 late_initcall(software_resume);
1201
1202 static int __init resume_setup(char *str)
1203 {
1204         if (resume_status == NORESUME)
1205                 return 1;
1206
1207         strncpy( resume_file, str, 255 );
1208         resume_status = RESUME_SPECIFIED;
1209
1210         return 1;
1211 }
1212
1213 static int __init noresume_setup(char *str)
1214 {
1215         resume_status = NORESUME;
1216         return 1;
1217 }
1218
1219 __setup("noresume", noresume_setup);
1220 __setup("resume=", resume_setup);
1221
1222 EXPORT_SYMBOL(software_suspend);
1223 EXPORT_SYMBOL(software_suspend_enabled);