patch-2_6_7-vs1_9_1_12
[linux-2.6.git] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
4  * This file is to realize architecture-independent
5  * machine suspend feature using pretty near only high-level routines
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/syscalls.h>
63 #include <linux/console.h>
64 #include <linux/highmem.h>
65
66 #include <asm/uaccess.h>
67 #include <asm/mmu_context.h>
68 #include <asm/pgtable.h>
69 #include <asm/io.h>
70
71 #include "power.h"
72
/*
 * Gate for software_suspend(): that function returns -EAGAIN while this
 * is zero, clears it for the duration of a suspend attempt and sets it
 * back to 1 when the attempt is over.
 */
73 unsigned char software_suspend_enabled = 0;
74
/* NOTE(review): presumably values for resume_status; their users are not
   in the part of the file reviewed here -- confirm. */
75 #define NORESUME                1
76 #define RESUME_SPECIFIED        2
77
78 /* References to section boundaries; pfn_is_nosave() turns them into a pfn range */
79 extern char __nosave_begin, __nosave_end;
80
/* Callers in this file treat a non-zero return as the number of pages to
   skip, starting at the given page. */
81 extern int is_head_of_free_region(struct page *);
82
83 /* Locks: taken with spin_lock_irq() around the critical sections below */
84 spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;
85
86 /* Variables to be preserved over suspend: written in suspend_prepare_image(),
   compared against the __nosavedata originals in do_magic_resume_2() */
87 static int pagedir_order_check;
88 static int nr_copy_pages_check;
89
90 static int resume_status;
91 static char resume_file[256] = "";                      /* For resume= kernel option */
92 static dev_t resume_device;	/* compared against each swap device in is_resume_device() */
93 /* Local variables that should not be affected by save */
94 unsigned int nr_copy_pages __nosavedata = 0;	/* number of pages in the image */
95
96 /* Suspend pagedir is allocated before final copy, therefore it
97    must be freed after resume 
98
99    Warning: this is evil. There are actually two pagedirs at time of
100    resume. One is "pagedir_save", which is empty frame allocated at
101    time of suspend, that must be freed. Second is "pagedir_nosave", 
102    allocated at time of resume, that travels through memory not to
103    collide with anything.
104
105    Warning: this is even more evil than it seems. Pagedirs this file
106    talks about are completely different from page directories used by
107    MMU hardware.
108  */
/* Pagedir in use: indexed per copied page by write_suspend_image() */
109 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
/* Set together with pagedir_nosave in suspend_prepare_image(); freed in do_magic_resume_2() */
110 static suspend_pagedir_t *pagedir_save;
/* Page allocation order of the pagedir, computed in create_suspend_pagedir() */
111 static int pagedir_order __nosavedata = 0;
112
/*
 * One page whose last sizeof(swp_entry_t) bytes hold a swap entry.
 * write_suspend_image() stores the previously written page's entry in
 * ->next, chaining the pagedir pages and the header together.
 */
113 struct link {
114         char dummy[PAGE_SIZE - sizeof(swp_entry_t)];	/* padding up to the trailing entry */
115         swp_entry_t next;
116 };
117
/*
 * The different views of a page read from / written to swap by this file:
 * the swap header (signature), a chain link, or the suspend header.
 * write_suspend_image() asserts that the union is exactly PAGE_SIZE.
 */
118 union diskpage {
119         union swap_header swh;
120         struct link link;
121         struct suspend_header sh;
122 };
123
124 /*
125  * XXX: We try to keep some more pages free so that I/O operations succeed
126  * without paging. Might this be more?
127  */
/* Added to nr_copy_pages in suspend_prepare_image() when checking free memory and swap */
128 #define PAGES_FOR_IO    512
129
/* Prefixes passed as the leading "%s" argument of this file's printk messages */
130 static const char name_suspend[] = "Suspend Machine: ";
131 static const char name_resume[] = "Resume Machine: ";
132
133 /*
134  * Debug
135  */
136 #define DEBUG_DEFAULT
137 #undef  DEBUG_PROCESS
138 #undef  DEBUG_SLOW
139 #define TEST_SWSUSP 0           /* Set to 1 to reboot instead of halt machine after suspension */
140
/* PRINTK() is printk() when DEBUG_DEFAULT is defined, otherwise a no-op */
141 #ifdef DEBUG_DEFAULT
142 # define PRINTK(f, a...)        printk(f, ## a)
143 #else
144 # define PRINTK(f, a...)        do { } while(0)
145 #endif
146
/* MDELAY() only delays when DEBUG_SLOW is defined, otherwise a no-op */
147 #ifdef DEBUG_SLOW
148 #define MDELAY(a) mdelay(a)
149 #else
150 #define MDELAY(a) do { } while(0)
151 #endif
152
153 /*
154  * Saving part...
155  */
156
157 static __inline__ int fill_suspend_header(struct suspend_header *sh)
158 {
159         memset((char *)sh, 0, sizeof(*sh));
160
161         sh->version_code = LINUX_VERSION_CODE;
162         sh->num_physpages = num_physpages;
163         strncpy(sh->machine, system_utsname.machine, 8);
164         strncpy(sh->version, system_utsname.version, 20);
165         /* FIXME: Is this bogus? --RR */
166         sh->num_cpus = num_online_cpus();
167         sh->page_size = PAGE_SIZE;
168         sh->suspend_pagedir = pagedir_nosave;
169         BUG_ON (pagedir_save != pagedir_nosave);
170         sh->num_pbes = nr_copy_pages;
171         /* TODO: needed? mounted fs' last mounted date comparison
172          * [so they haven't been mounted since last suspend.
173          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
174          */
175         return 0;
176 }
177
178 /* We memorize in swapfile_used what swap devices are used for suspension */
179 #define SWAPFILE_UNUSED    0	/* slot whose swap_info flags are zero */
180 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
181 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
182
/* Per swap type classification, filled in by read_swapfiles() */
183 static unsigned short swapfile_used[MAX_SWAPFILES];
/* Swap type of the suspend device; 0xFFFF means "none", which makes mark_swapfiles() a no-op */
184 static unsigned short root_swap;
/* Modes for mark_swapfiles(): write the suspend signature, or restore the normal swap signature */
185 #define MARK_SWAP_SUSPEND 0
186 #define MARK_SWAP_RESUME 2
187
188 static void mark_swapfiles(swp_entry_t prev, int mode)
189 {
190         swp_entry_t entry;
191         union diskpage *cur;
192         struct page *page;
193
194         if (root_swap == 0xFFFF)  /* ignored */
195                 return;
196
197         page = alloc_page(GFP_ATOMIC);
198         if (!page)
199                 panic("Out of memory in mark_swapfiles");
200         cur = page_address(page);
201         /* XXX: this is dirty hack to get first page of swap file */
202         entry = swp_entry(root_swap, 0);
203         rw_swap_page_sync(READ, entry, page);
204
205         if (mode == MARK_SWAP_RESUME) {
206                 if (!memcmp("S1",cur->swh.magic.magic,2))
207                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
208                 else if (!memcmp("S2",cur->swh.magic.magic,2))
209                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
210                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
211                         name_resume, cur->swh.magic.magic);
212         } else {
213                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
214                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
215                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
216                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
217                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
218                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
219                 /* link.next lies *no more* in last 4/8 bytes of magic */
220         }
221         rw_swap_page_sync(WRITE, entry, page);
222         __free_page(page);
223 }
224
225
226 /*
227  * Check whether the swap device is the specified resume
228  * device, irrespective of whether they are specified by
229  * identical names.
230  *
231  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
232  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
233  * and they'll be considered the same device.  This is *necessary* for
234  * devfs, since the resume code can only recognize the form /dev/hda4,
235  * but the suspend code would see the long name.)
236  */
237 static int is_resume_device(const struct swap_info_struct *swap_info)
238 {
239         struct file *file = swap_info->swap_file;
240         struct inode *inode = file->f_dentry->d_inode;
241
242         return S_ISBLK(inode->i_mode) &&
243                 resume_device == MKDEV(imajor(inode), iminor(inode));
244 }
245
246 static void read_swapfiles(void) /* This is called before saving image */
247 {
248         int i, len;
249         
250         len=strlen(resume_file);
251         root_swap = 0xFFFF;
252         
253         swap_list_lock();
254         for(i=0; i<MAX_SWAPFILES; i++) {
255                 if (swap_info[i].flags == 0) {
256                         swapfile_used[i]=SWAPFILE_UNUSED;
257                 } else {
258                         if(!len) {
259                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
260                                 if(root_swap == 0xFFFF) {
261                                         swapfile_used[i] = SWAPFILE_SUSPEND;
262                                         root_swap = i;
263                                 } else
264                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
265                         } else {
266                                 /* we ignore all swap devices that are not the resume_file */
267                                 if (is_resume_device(&swap_info[i])) {
268                                         swapfile_used[i] = SWAPFILE_SUSPEND;
269                                         root_swap = i;
270                                 } else {
271                                         swapfile_used[i] = SWAPFILE_IGNORED;
272                                 }
273                         }
274                 }
275         }
276         swap_list_unlock();
277 }
278
279 static void lock_swapdevices(void) /* This is called after saving image so modification
280                                       will be lost after resume... and that's what we want. */
281 {
282         int i;
283
284         swap_list_lock();
285         for(i = 0; i< MAX_SWAPFILES; i++)
286                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
287                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
288                                                        lock_swapdevices can unlock the devices. */
289                 }
290         swap_list_unlock();
291 }
292
293 /**
294  *    write_suspend_image - Write entire image to disk.
295  *
296  *    After writing suspend signature to the disk, suspend may no
297  *    longer fail: we have ready-to-run image in swap, and rollback
298  *    would happen on next reboot -- corrupting data.
299  *
300  *    Note: The buffer we allocate to use to write the suspend header is
301  *    not freed; its not needed since the system is going down anyway
302  *    (plus it causes an oops and I'm lazy^H^H^H^Htoo busy).
303  */
304 static int write_suspend_image(void)
305 {
306         int i;
307         swp_entry_t entry, prev = { 0 };
308         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
309         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
310         unsigned long address;
311         struct page *page;
312
313         if (!buffer)
314                 return -ENOMEM;
315
316         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
317         for (i=0; i<nr_copy_pages; i++) {
318                 if (!(i%100))
319                         printk( "." );
320                 if (!(entry = get_swap_page()).val)
321                         panic("\nNot enough swapspace when writing data" );
322                 
323                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
324                         panic("\nPage %d: not enough swapspace on suspend device", i );
325             
326                 address = (pagedir_nosave+i)->address;
327                 page = virt_to_page(address);
328                 rw_swap_page_sync(WRITE, entry, page);
329                 (pagedir_nosave+i)->swap_address = entry;
330         }
331         printk( "|\n" );
332         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
333         for (i=0; i<nr_pgdir_pages; i++) {
334                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
335                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
336                 printk( "." );
337                 if (!(entry = get_swap_page()).val) {
338                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
339                         panic("Don't know how to recover");
340                         free_page((unsigned long) buffer);
341                         return -ENOSPC;
342                 }
343
344                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
345                         panic("\nNot enough swapspace for pagedir on suspend device" );
346
347                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
348                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
349
350                 cur->link.next = prev;                          
351                 page = virt_to_page((unsigned long)cur);
352                 rw_swap_page_sync(WRITE, entry, page);
353                 prev = entry;
354         }
355         printk("H");
356         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
357         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
358         BUG_ON (sizeof(struct link) != PAGE_SIZE);
359         if (!(entry = get_swap_page()).val)
360                 panic( "\nNot enough swapspace when writing header" );
361         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
362                 panic("\nNot enough swapspace for header on suspend device" );
363
364         cur = (void *) buffer;
365         if (fill_suspend_header(&cur->sh))
366                 BUG();          /* Not a BUG_ON(): we want fill_suspend_header to be called, always */
367                 
368         cur->link.next = prev;
369
370         page = virt_to_page((unsigned long)cur);
371         rw_swap_page_sync(WRITE, entry, page);
372         prev = entry;
373
374         printk( "S" );
375         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
376         printk( "|\n" );
377
378         MDELAY(1000);
379         return 0;
380 }
381
382 #ifdef CONFIG_HIGHMEM
/* One saved highmem page; entries form a singly linked list headed by highmem_copy */
383 struct highmem_page {
384         char *data;	/* page obtained from get_zeroed_page() holding the copied contents */
385         struct page *page;	/* the highmem page the contents belong to */
386         struct highmem_page *next;
387 };
388
/* List head: save_highmem_zone() pushes entries, restore_highmem() pops and frees them */
389 struct highmem_page *highmem_copy = NULL;
390
391 static int save_highmem_zone(struct zone *zone)
392 {
393         unsigned long zone_pfn;
394         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
395                 struct page *page;
396                 struct highmem_page *save;
397                 void *kaddr;
398                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
399                 int chunk_size;
400
401                 if (!(pfn%1000))
402                         printk(".");
403                 if (!pfn_valid(pfn))
404                         continue;
405                 page = pfn_to_page(pfn);
406                 /*
407                  * This condition results from rvmalloc() sans vmalloc_32()
408                  * and architectural memory reservations. This should be
409                  * corrected eventually when the cases giving rise to this
410                  * are better understood.
411                  */
412                 if (PageReserved(page)) {
413                         printk("highmem reserved page?!\n");
414                         continue;
415                 }
416                 if ((chunk_size = is_head_of_free_region(page))) {
417                         pfn += chunk_size - 1;
418                         zone_pfn += chunk_size - 1;
419                         continue;
420                 }
421                 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
422                 if (!save)
423                         return -ENOMEM;
424                 save->next = highmem_copy;
425                 save->page = page;
426                 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
427                 if (!save->data) {
428                         kfree(save);
429                         return -ENOMEM;
430                 }
431                 kaddr = kmap_atomic(page, KM_USER0);
432                 memcpy(save->data, kaddr, PAGE_SIZE);
433                 kunmap_atomic(kaddr, KM_USER0);
434                 highmem_copy = save;
435         }
436         return 0;
437 }
438
439 static int save_highmem(void)
440 {
441         struct zone *zone;
442         int res = 0;
443         for_each_zone(zone) {
444                 if (is_highmem(zone))
445                         res = save_highmem_zone(zone);
446                 if (res)
447                         return res;
448         }
449         return 0;
450 }
451
452 static int restore_highmem(void)
453 {
454         while (highmem_copy) {
455                 struct highmem_page *save = highmem_copy;
456                 void *kaddr;
457                 highmem_copy = save->next;
458
459                 kaddr = kmap_atomic(save->page, KM_USER0);
460                 memcpy(kaddr, save->data, PAGE_SIZE);
461                 kunmap_atomic(kaddr, KM_USER0);
462                 free_page((long) save->data);
463                 kfree(save);
464         }
465         return 0;
466 }
467 #endif
468
469 static int pfn_is_nosave(unsigned long pfn)
470 {
471         unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
472         unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
473         return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
474 }
475
476 /* if *pagedir_p != NULL it also copies the counted pages */
477 static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
478 {
479         unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
480         struct pbe *pbe = *pagedir_p;
481         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
482                 struct page *page;
483                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
484
485                 if (!(pfn%1000))
486                         printk(".");
487                 if (!pfn_valid(pfn))
488                         continue;
489                 page = pfn_to_page(pfn);
490                 BUG_ON(PageReserved(page) && PageNosave(page));
491                 if (PageNosave(page))
492                         continue;
493                 if (PageReserved(page) && pfn_is_nosave(pfn)) {
494                         PRINTK("[nosave pfn 0x%lx]", pfn);
495                         continue;
496                 }
497                 if ((chunk_size = is_head_of_free_region(page))) {
498                         pfn += chunk_size - 1;
499                         zone_pfn += chunk_size - 1;
500                         continue;
501                 }
502                 nr_copy_pages++;
503                 if (!pbe)
504                         continue;
505                 pbe->orig_address = (long) page_address(page);
506                 copy_page((void *)pbe->address, (void *)pbe->orig_address);
507                 pbe++;
508         }
509         *pagedir_p = pbe;
510         return nr_copy_pages;
511 }
512
513 static int count_and_copy_data_pages(struct pbe *pagedir_p)
514 {
515         int nr_copy_pages = 0;
516         struct zone *zone;
517         for_each_zone(zone) {
518                 if (!is_highmem(zone))
519                         nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
520         }
521         return nr_copy_pages;
522 }
523
524 static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
525 {
526         unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
527         pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
528         pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
529         pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
530         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
531                 struct page *page;
532                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
533                 if (!pfn_valid(pfn))
534                         continue;
535                 page = pfn_to_page(pfn);
536                 if (!TestClearPageNosave(page))
537                         continue;
538                 else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
539                         continue;
540                 __free_page(page);
541         }
542 }
543
544 static void free_suspend_pagedir(unsigned long this_pagedir)
545 {
546         struct zone *zone;
547         for_each_zone(zone) {
548                 if (!is_highmem(zone))
549                         free_suspend_pagedir_zone(zone, this_pagedir);
550         }
551         free_pages(this_pagedir, pagedir_order);
552 }
553
554 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
555 {
556         int i;
557         suspend_pagedir_t *pagedir;
558         struct pbe *p;
559         struct page *page;
560
561         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
562
563         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
564         if (!pagedir)
565                 return NULL;
566
567         page = virt_to_page(pagedir);
568         for(i=0; i < 1<<pagedir_order; i++)
569                 SetPageNosave(page++);
570                 
571         while(nr_copy_pages--) {
572                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
573                 if (!p->address) {
574                         free_suspend_pagedir((unsigned long) pagedir);
575                         return NULL;
576                 }
577                 SetPageNosave(virt_to_page(p->address));
578                 p->orig_address = 0;
579                 p++;
580         }
581         return pagedir;
582 }
583
584 static int prepare_suspend_processes(void)
585 {
586         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
587         if (freeze_processes()) {
588                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
589                 thaw_processes();
590                 return 1;
591         }
592         return 0;
593 }
594
/*
 * Try to free as much memory as possible, but do not OOM-kill anyone.
 *
 * Calls shrink_all_memory(10000) until it returns zero, printing a dot
 * after each call that returned non-zero.
 *
 * Notice: all userland should be stopped at this point, or livelock is
 * possible.
 */
static void free_some_memory(void)
{
	printk("Freeing memory: ");
	for (;;) {
		if (!shrink_all_memory(10000))
			break;
		printk(".");
	}
	printk("|\n");
}
607
608 static int suspend_prepare_image(void)
609 {
610         struct sysinfo i;
611         unsigned int nr_needed_pages = 0;
612
613         pagedir_nosave = NULL;
614         printk( "/critical section: ");
615 #ifdef CONFIG_HIGHMEM
616         printk( "handling highmem" );
617         if (save_highmem()) {
618                 printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
619                 return -ENOMEM;
620         }
621         printk(", ");
622 #endif
623
624         printk("counting pages to copy" );
625         drain_local_pages();
626         nr_copy_pages = count_and_copy_data_pages(NULL);
627         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
628         
629         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
630         if(nr_free_pages() < nr_needed_pages) {
631                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
632                        name_suspend, nr_needed_pages-nr_free_pages());
633                 root_swap = 0xFFFF;
634                 return -ENOMEM;
635         }
636         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
637                                    We should only consider resume_device. */
638         if (i.freeswap < nr_needed_pages)  {
639                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
640                        name_suspend, nr_needed_pages-i.freeswap);
641                 return -ENOSPC;
642         }
643
644         PRINTK( "Alloc pagedir\n" ); 
645         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
646         if (!pagedir_nosave) {
647                 /* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
648                 printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
649                 return -ENOMEM;
650         }
651         nr_copy_pages_check = nr_copy_pages;
652         pagedir_order_check = pagedir_order;
653
654         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
655         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
656                 BUG();
657
658         /*
659          * End of critical section. From now on, we can write to memory,
660          * but we should not touch disk. This specially means we must _not_
661          * touch swap space! Except we must write out our image of course.
662          */
663
664         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
665         return 0;
666 }
667
668 static void suspend_save_image(void)
669 {
670         device_resume();
671
672         lock_swapdevices();
673         write_suspend_image();
674         lock_swapdevices();     /* This will unlock ignored swap devices since writing is finished */
675
676         /* It is important _NOT_ to umount filesystems at this point. We want
677          * them synced (in case something goes wrong) but we DO not want to mark
678          * filesystem clean: it is not. (And it does not matter, if we resume
679          * correctly, we'll mark system clean, anyway.)
680          */
681 }
682
683 static void suspend_power_down(void)
684 {
685         extern int C_A_D;
686         C_A_D = 0;
687         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
688 #ifdef CONFIG_VT
689         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
690         mdelay(1000);
691         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
692                 machine_restart(NULL);
693         else
694 #endif
695         {
696                 device_shutdown();
697                 machine_power_off();
698         }
699
700         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
701         machine_halt();
702         while (1);
703         /* NOTREACHED */
704 }
705
706 /*
707  * Magic happens here
708  */
709
/*
 * First C step of the resume path: per the do_magic() outline in this
 * file it runs before the saved pages are copied back.
 *
 * Takes suspend_pagedir_lock with interrupts disabled, powers the
 * devices down and then waits one second. The lock is released again in
 * do_magic_resume_2().
 */
710 asmlinkage void do_magic_resume_1(void)
711 {
712         barrier();
713         mb();
714         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
715
716         device_power_down(4);
717         PRINTK( "Waiting for DMAs to settle down...\n");
718         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
719                            Do it with disabled interrupts for best effect. That way, if some
720                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
721 }
722
/*
 * Last C step of the resume path: per the do_magic() outline in this
 * file it runs after the pages were copied back and the processor state
 * was restored.
 *
 * Checks the restored image against the values kept in __nosavedata,
 * frees the pagedir allocated at suspend time, restores highmem
 * (CONFIG_HIGHMEM only), powers up and resumes the devices, and puts the
 * normal swap signature back.
 */
723 asmlinkage void do_magic_resume_2(void)
724 {
        /* The *_check copies were written by suspend_prepare_image() */
725         BUG_ON (nr_copy_pages_check != nr_copy_pages);
726         BUG_ON (pagedir_order_check != pagedir_order);
727
728         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
729
730         PRINTK( "Freeing prev allocated pagedir\n" );
731         free_suspend_pagedir((unsigned long) pagedir_save);
732
733 #ifdef CONFIG_HIGHMEM
734         printk( "Restoring highmem\n" );
735         restore_highmem();
736 #endif
737         printk("done, devices\n");
738
739         device_power_up();
        /* Pairs with the spin_lock_irq() in do_magic_resume_1() */
740         spin_unlock_irq(&suspend_pagedir_lock);
741         device_resume();
742
743         /* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
744         PRINTK( "Fixing swap signatures... " );
        /* The entry argument is not used in MARK_SWAP_RESUME mode */
745         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
746         PRINTK( "ok\n" );
747
748 #ifdef SUSPEND_CONSOLE
749         acquire_console_sem();
750         update_screen(fg_console);
751         release_console_sem();
752 #endif
753 }
754
755 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
756
757         if (!resume) {
758                 do_magic_suspend_1();
759                 save_processor_state();
760                 SAVE_REGISTERS
761                 do_magic_suspend_2();
762                 return;
763         }
764         GO_TO_SWAPPER_PAGE_TABLES
765         do_magic_resume_1();
766         COPY_PAGES_BACK
767         RESTORE_REGISTERS
768         restore_processor_state();
769         do_magic_resume_2();
770
771  */
772
/*
 * First C step of the suspend path (see the do_magic() outline above).
 *
 * Must not be entered in atomic context. Takes suspend_pagedir_lock
 * with interrupts disabled; the lock is released in
 * do_magic_suspend_2().
 */
773 asmlinkage void do_magic_suspend_1(void)
774 {
775         mb();
776         barrier();
777         BUG_ON(in_atomic());
778         spin_lock_irq(&suspend_pagedir_lock);
779 }
780
/*
 * Second C step of the suspend path (see the do_magic() outline above).
 *
 * Selects the suspend swap device, powers the devices down, builds the
 * in-memory image, powers the devices up again and drops the lock taken
 * in do_magic_suspend_1(). If the image was built, it is written to swap
 * and the machine is powered down; suspend_power_down() ends in an
 * endless loop, so the code after the if block only runs when building
 * the image failed.
 */
781 asmlinkage void do_magic_suspend_2(void)
782 {
783         int is_problem;	/* non-zero: suspend_prepare_image() failed */
784         read_swapfiles();
785         device_power_down(4);
786         is_problem = suspend_prepare_image();
787         device_power_up();
788         spin_unlock_irq(&suspend_pagedir_lock);
789         if (!is_problem) {
790                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
791                 BUG_ON(in_atomic());
792                 suspend_save_image();
793                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
794         }
795
        /* Failure path from here on */
796         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
797         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
798
799         barrier();
800         mb();
801         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
802         mdelay(1000);
803
        /* Releases only the pagedir allocation itself, not the copy pages it points to */
804         free_pages((unsigned long) pagedir_nosave, pagedir_order);
805         spin_unlock_irq(&suspend_pagedir_lock);
806
807         device_resume();
808         PRINTK( "Fixing swap signatures... " );
        /* Returns immediately when root_swap is 0xFFFF */
809         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
810         PRINTK( "ok\n" );
811 }
812
813 /*
814  * This is main interface to the outside world. It needs to be
815  * called from process context.
816  */
817 int software_suspend(void)
818 {
819         int res;
820         if (!software_suspend_enabled)
821                 return -EAGAIN;
822
823         software_suspend_enabled = 0;
824         might_sleep();
825
826         if (arch_prepare_suspend()) {
827                 printk("%sArchitecture failed to prepare\n", name_suspend);
828                 return -EPERM;
829         }               
830         if (pm_prepare_console())
831                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
832         if (!prepare_suspend_processes()) {
833
834                 /* At this point, all user processes and "dangerous"
835                    kernel threads are stopped. Free some memory, as we
836                    need half of memory free. */
837
838                 free_some_memory();
839                 
840                 /* Save state of all device drivers, and stop them. */             
841                 if ((res = device_suspend(4))==0)
842                         /* If stopping device drivers worked, we proceed basically into
843                          * suspend_save_image.
844                          *
845                          * do_magic(0) returns after system is resumed.
846                          *
847                          * do_magic() copies all "used" memory to "free" memory, then
848                          * unsuspends all device drivers, and writes memory to disk
849                          * using normal kernel mechanism.
850                          */
851                         do_magic(0);
852                 thaw_processes();
853         } else
854                 res = -EBUSY;
855         software_suspend_enabled = 1;
856         MDELAY(1000);
857         pm_restore_console();
858         return res;
859 }
860
861 /* More restore stuff */
862
863 /* FIXME: Why not memcpy(to, from, 1<<pagedir_order*PAGE_SIZE)? */
864 static void copy_pagedir(suspend_pagedir_t *to, suspend_pagedir_t *from)
865 {
866         int i;
867         char *topointer=(char *)to, *frompointer=(char *)from;
868
869         for(i=0; i < 1 << pagedir_order; i++) {
870                 copy_page(topointer, frompointer);
871                 topointer += PAGE_SIZE;
872                 frompointer += PAGE_SIZE;
873         }
874 }
875
876 #define does_collide(addr) does_collide_order(pagedir_nosave, addr, 0)
877
878 /*
879  * Returns true if given address/order collides with any orig_address 
880  */
881 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
882                 int order)
883 {
884         int i;
885         unsigned long addre = addr + (PAGE_SIZE<<order);
886         
887         for(i=0; i < nr_copy_pages; i++)
888                 if((pagedir+i)->orig_address >= addr &&
889                         (pagedir+i)->orig_address < addre)
890                         return 1;
891
892         return 0;
893 }
894
895 /*
896  * We check here that pagedir & pages it points to won't collide with pages
897  * where we're going to restore from the loaded pages later
898  */
899 static int check_pagedir(void)
900 {
901         int i;
902
903         for(i=0; i < nr_copy_pages; i++) {
904                 unsigned long addr;
905
906                 do {
907                         addr = get_zeroed_page(GFP_ATOMIC);
908                         if(!addr)
909                                 return -ENOMEM;
910                 } while (does_collide(addr));
911
912                 (pagedir_nosave+i)->address = addr;
913         }
914         return 0;
915 }
916
917 static int relocate_pagedir(void)
918 {
919         /*
920          * We have to avoid recursion (not to overflow kernel stack),
921          * and that's why code looks pretty cryptic 
922          */
923         suspend_pagedir_t *new_pagedir, *old_pagedir = pagedir_nosave;
924         void **eaten_memory = NULL;
925         void **c = eaten_memory, *m, *f;
926
927         printk("Relocating pagedir");
928
929         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
930                 printk("not necessary\n");
931                 return 0;
932         }
933
934         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
935                 memset(m, 0, PAGE_SIZE);
936                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
937                         break;
938                 eaten_memory = m;
939                 printk( "." ); 
940                 *eaten_memory = c;
941                 c = eaten_memory;
942         }
943
944         if (!m)
945                 return -ENOMEM;
946
947         pagedir_nosave = new_pagedir = m;
948         copy_pagedir(new_pagedir, old_pagedir);
949
950         c = eaten_memory;
951         while(c) {
952                 printk(":");
953                 f = *c;
954                 c = *c;
955                 if (f)
956                         free_pages((unsigned long)f, pagedir_order);
957         }
958         printk("|\n");
959         return 0;
960 }
961
/*
 * Sanity check: does this image make sense with this kernel/swap context?
 * It is certainly not foolproof, but it is better than nothing.
 */
966
967 static int sanity_check_failed(char *reason)
968 {
969         printk(KERN_ERR "%s%s\n", name_resume, reason);
970         return -EPERM;
971 }
972
973 static int sanity_check(struct suspend_header *sh)
974 {
975         if (sh->version_code != LINUX_VERSION_CODE)
976                 return sanity_check_failed("Incorrect kernel version");
977         if (sh->num_physpages != num_physpages)
978                 return sanity_check_failed("Incorrect memory size");
979         if (strncmp(sh->machine, system_utsname.machine, 8))
980                 return sanity_check_failed("Incorrect machine type");
981         if (strncmp(sh->version, system_utsname.version, 20))
982                 return sanity_check_failed("Incorrect version");
983         if (sh->num_cpus != num_online_cpus())
984                 return sanity_check_failed("Incorrect number of cpus");
985         if (sh->page_size != PAGE_SIZE)
986                 return sanity_check_failed("Incorrect PAGE_SIZE");
987         return 0;
988 }
989
990 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
991 {
992         struct buffer_head *bh;
993         BUG_ON (pos%PAGE_SIZE);
994         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
995         if (!bh || (!bh->b_data)) {
996                 return -1;
997         }
998         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
999         BUG_ON(!buffer_uptodate(bh));
1000         brelse(bh);
1001         return 0;
1002
1003
1004 static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
1005 {
1006 #if 0
1007         struct buffer_head *bh;
1008         BUG_ON (pos%PAGE_SIZE);
1009         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
1010         if (!bh || (!bh->b_data)) {
1011                 return -1;
1012         }
1013         memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
1014         BUG_ON(!buffer_uptodate(bh));
1015         generic_make_request(WRITE, bh);
1016         if (!buffer_uptodate(bh))
1017                 printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
1018         wait_on_buffer(bh);
1019         brelse(bh);
1020         return 0;
1021 #endif
1022         printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
1023         return 0;
1024 }
1025
1026 extern dev_t __init name_to_dev_t(const char *line);
1027
1028 static int __init __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
1029 {
1030         swp_entry_t next;
1031         int i, nr_pgdir_pages;
1032
1033 #define PREPARENEXT \
1034         {       next = cur->link.next; \
1035                 next.val = swp_offset(next) * PAGE_SIZE; \
1036         }
1037
1038         if (bdev_read_page(bdev, 0, cur)) return -EIO;
1039
1040         if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
1041             (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
1042                 printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
1043                 return -EINVAL;
1044         }
1045
1046         PREPARENEXT; /* We have to read next position before we overwrite it */
1047
1048         if (!memcmp("S1",cur->swh.magic.magic,2))
1049                 memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
1050         else if (!memcmp("S2",cur->swh.magic.magic,2))
1051                 memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
1052         else {
1053                 if (noresume)
1054                         return -EINVAL;
1055                 panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
1056                         name_resume, cur->swh.magic.magic);
1057         }
1058         if (noresume) {
1059                 /* We don't do a sanity check here: we want to restore the swap
1060                    whatever version of kernel made the suspend image;
1061                    We need to write swap, but swap is *not* enabled so
1062                    we must write the device directly */
1063                 printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
1064                 bdev_write_page(bdev, 0, cur);
1065         }
1066
1067         printk( "%sSignature found, resuming\n", name_resume );
1068         MDELAY(1000);
1069
1070         if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1071         if (sanity_check(&cur->sh))     /* Is this same machine? */     
1072                 return -EPERM;
1073         PREPARENEXT;
1074
1075         pagedir_save = cur->sh.suspend_pagedir;
1076         nr_copy_pages = cur->sh.num_pbes;
1077         nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
1078         pagedir_order = get_bitmask_order(nr_pgdir_pages);
1079
1080         pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
1081         if (!pagedir_nosave)
1082                 return -ENOMEM;
1083
1084         PRINTK( "%sReading pagedir, ", name_resume );
1085
1086         /* We get pages in reverse order of saving! */
1087         for (i=nr_pgdir_pages-1; i>=0; i--) {
1088                 BUG_ON (!next.val);
1089                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
1090                 if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1091                 PREPARENEXT;
1092         }
1093         BUG_ON (next.val);
1094
1095         if (relocate_pagedir())
1096                 return -ENOMEM;
1097         if (check_pagedir())
1098                 return -ENOMEM;
1099
1100         printk( "Reading image data (%d pages): ", nr_copy_pages );
1101         for(i=0; i < nr_copy_pages; i++) {
1102                 swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
1103                 if (!(i%100))
1104                         printk( "." );
1105                 /* You do not need to check for overlaps...
1106                    ... check_pagedir already did this work */
1107                 if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
1108                         return -EIO;
1109         }
1110         printk( "|\n" );
1111         return 0;
1112 }
1113
1114 static int __init read_suspend_image(const char * specialfile, int noresume)
1115 {
1116         union diskpage *cur;
1117         unsigned long scratch_page = 0;
1118         int error;
1119         char b[BDEVNAME_SIZE];
1120
1121         resume_device = name_to_dev_t(specialfile);
1122         scratch_page = get_zeroed_page(GFP_ATOMIC);
1123         cur = (void *) scratch_page;
1124         if (cur) {
1125                 struct block_device *bdev;
1126                 printk("Resuming from device %s\n",
1127                                 __bdevname(resume_device, b));
1128                 bdev = open_by_devnum(resume_device, FMODE_READ);
1129                 if (IS_ERR(bdev)) {
1130                         error = PTR_ERR(bdev);
1131                 } else {
1132                         set_blocksize(bdev, PAGE_SIZE);
1133                         error = __read_suspend_image(bdev, cur, noresume);
1134                         blkdev_put(bdev);
1135                 }
1136         } else error = -ENOMEM;
1137
1138         if (scratch_page)
1139                 free_page(scratch_page);
1140         switch (error) {
1141                 case 0:
1142                         PRINTK("Reading resume file was successful\n");
1143                         break;
1144                 case -EINVAL:
1145                         break;
1146                 case -EIO:
1147                         printk( "%sI/O error\n", name_resume);
1148                         break;
1149                 case -ENOENT:
1150                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1151                         break;
1152                 case -ENOMEM:
1153                         printk( "%sNot enough memory\n", name_resume);
1154                         break;
1155                 default:
1156                         printk( "%sError %d resuming\n", name_resume, error );
1157         }
1158         MDELAY(1000);
1159         return error;
1160 }
1161
1162 /**
1163  *      software_resume - Resume from a saved image.
1164  *
1165  *      Called as a late_initcall (so all devices are discovered and 
1166  *      initialized), we call swsusp to see if we have a saved image or not.
1167  *      If so, we quiesce devices, then restore the saved image. We will 
1168  *      return above (in pm_suspend_disk() ) if everything goes well. 
1169  *      Otherwise, we fail gracefully and return to the normally 
1170  *      scheduled program.
1171  *
1172  */
1173 static int __init software_resume(void)
1174 {
1175         if (num_online_cpus() > 1) {
1176                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1177                 return -EINVAL;
1178         }
1179         /* We enable the possibility of machine suspend */
1180         software_suspend_enabled = 1;
1181         if (!resume_status)
1182                 return 0;
1183
1184         printk( "%s", name_resume );
1185         if (resume_status == NORESUME) {
1186                 if(resume_file[0])
1187                         read_suspend_image(resume_file, 1);
1188                 printk( "disabled\n" );
1189                 return 0;
1190         }
1191         MDELAY(1000);
1192
1193         if (pm_prepare_console())
1194                 printk("swsusp: Can't allocate a console... proceeding\n");
1195
1196         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1197                 printk( "suspension device unspecified\n" );
1198                 return -EINVAL;
1199         }
1200
1201         printk( "resuming from %s\n", resume_file);
1202         if (read_suspend_image(resume_file, 0))
1203                 goto read_failure;
1204         device_suspend(4);
1205         do_magic(1);
1206         panic("This never returns");
1207
1208 read_failure:
1209         pm_restore_console();
1210         return 0;
1211 }
1212
1213 late_initcall(software_resume);
1214
1215 static int __init resume_setup(char *str)
1216 {
1217         if (resume_status == NORESUME)
1218                 return 1;
1219
1220         strncpy( resume_file, str, 255 );
1221         resume_status = RESUME_SPECIFIED;
1222
1223         return 1;
1224 }
1225
/*
 * Handler for the "noresume" boot option: set resume_status to NORESUME.
 * @str is not used. Always returns 1.
 */
static int __init noresume_setup(char *str)
{
	resume_status = NORESUME;
	return 1;
}
/* Hook the handlers above up to the "noresume" and "resume=" options */
__setup("noresume", noresume_setup);
__setup("resume=", resume_setup);

/* Symbols made available outside this file */
EXPORT_SYMBOL(software_suspend);
EXPORT_SYMBOL(software_suspend_enabled);