VServer 1.9.2 (patch-2.6.8.1-vs1.9.2.diff)
[linux-2.6.git] / kernel / power / swsusp.c
1 /*
2  * linux/kernel/power/swsusp.c
3  *
 * This file implements an architecture-independent machine suspend
 * feature, using almost exclusively high-level kernel routines.
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/syscalls.h>
63 #include <linux/console.h>
64 #include <linux/highmem.h>
65
66 #include <asm/uaccess.h>
67 #include <asm/mmu_context.h>
68 #include <asm/pgtable.h>
69 #include <asm/io.h>
70
71 #include "power.h"
72
73 unsigned char software_suspend_enabled = 0;
74
75 #define NORESUME                1
76 #define RESUME_SPECIFIED        2
77
78 /* References to section boundaries */
79 extern char __nosave_begin, __nosave_end;
80
81 extern int is_head_of_free_region(struct page *);
82
83 /* Locks */
84 spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;
85
86 /* Variables to be preserved over suspend */
87 static int pagedir_order_check;
88 static int nr_copy_pages_check;
89
90 static int resume_status;
91 static char resume_file[256] = "";                      /* For resume= kernel option */
92 static dev_t resume_device;
93 /* Local variables that should not be affected by save */
94 unsigned int nr_copy_pages __nosavedata = 0;
95
96 /* Suspend pagedir is allocated before final copy, therefore it
97    must be freed after resume 
98
99    Warning: this is evil. There are actually two pagedirs at time of
100    resume. One is "pagedir_save", which is empty frame allocated at
101    time of suspend, that must be freed. Second is "pagedir_nosave", 
102    allocated at time of resume, that travels through memory not to
103    collide with anything.
104
105    Warning: this is even more evil than it seems. Pagedirs this file
106    talks about are completely different from page directories used by
107    MMU hardware.
108  */
109 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
110 static suspend_pagedir_t *pagedir_save;
111 static int pagedir_order __nosavedata = 0;
112
113 struct link {
114         char dummy[PAGE_SIZE - sizeof(swp_entry_t)];
115         swp_entry_t next;
116 };
117
118 union diskpage {
119         union swap_header swh;
120         struct link link;
121         struct suspend_header sh;
122 };
123
124 /*
125  * XXX: We try to keep some more pages free so that I/O operations succeed
126  * without paging. Might this be more?
127  */
128 #define PAGES_FOR_IO    512
129
130 static const char name_suspend[] = "Suspend Machine: ";
131 static const char name_resume[] = "Resume Machine: ";
132
133 /*
134  * Debug
135  */
136 #define DEBUG_DEFAULT
137 #undef  DEBUG_PROCESS
138 #undef  DEBUG_SLOW
139 #define TEST_SWSUSP 0           /* Set to 1 to reboot instead of halt machine after suspension */
140
141 #ifdef DEBUG_DEFAULT
142 # define PRINTK(f, a...)        printk(f, ## a)
143 #else
144 # define PRINTK(f, a...)        do { } while(0)
145 #endif
146
147 #ifdef DEBUG_SLOW
148 #define MDELAY(a) mdelay(a)
149 #else
150 #define MDELAY(a) do { } while(0)
151 #endif
152
153 /*
154  * Saving part...
155  */
156
157 static __inline__ int fill_suspend_header(struct suspend_header *sh)
158 {
159         memset((char *)sh, 0, sizeof(*sh));
160
161         sh->version_code = LINUX_VERSION_CODE;
162         sh->num_physpages = num_physpages;
163         strncpy(sh->machine, system_utsname.machine, 8);
164         strncpy(sh->version, system_utsname.version, 20);
165         /* FIXME: Is this bogus? --RR */
166         sh->num_cpus = num_online_cpus();
167         sh->page_size = PAGE_SIZE;
168         sh->suspend_pagedir = pagedir_nosave;
169         BUG_ON (pagedir_save != pagedir_nosave);
170         sh->num_pbes = nr_copy_pages;
171         /* TODO: needed? mounted fs' last mounted date comparison
172          * [so they haven't been mounted since last suspend.
173          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
174          */
175         return 0;
176 }
177
178 /* We memorize in swapfile_used what swap devices are used for suspension */
179 #define SWAPFILE_UNUSED    0
180 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
181 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
182
183 static unsigned short swapfile_used[MAX_SWAPFILES];
184 static unsigned short root_swap;
185 #define MARK_SWAP_SUSPEND 0
186 #define MARK_SWAP_RESUME 2
187
188 static void mark_swapfiles(swp_entry_t prev, int mode)
189 {
190         swp_entry_t entry;
191         union diskpage *cur;
192         struct page *page;
193
194         if (root_swap == 0xFFFF)  /* ignored */
195                 return;
196
197         page = alloc_page(GFP_ATOMIC);
198         if (!page)
199                 panic("Out of memory in mark_swapfiles");
200         cur = page_address(page);
201         /* XXX: this is dirty hack to get first page of swap file */
202         entry = swp_entry(root_swap, 0);
203         rw_swap_page_sync(READ, entry, page);
204
205         if (mode == MARK_SWAP_RESUME) {
206                 if (!memcmp("S1",cur->swh.magic.magic,2))
207                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
208                 else if (!memcmp("S2",cur->swh.magic.magic,2))
209                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
210                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
211                         name_resume, cur->swh.magic.magic);
212         } else {
213                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
214                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
215                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
216                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
217                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
218                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
219                 /* link.next lies *no more* in last 4/8 bytes of magic */
220         }
221         rw_swap_page_sync(WRITE, entry, page);
222         __free_page(page);
223 }
224
225
226 /*
227  * Check whether the swap device is the specified resume
228  * device, irrespective of whether they are specified by
229  * identical names.
230  *
231  * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
232  * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
233  * and they'll be considered the same device.  This is *necessary* for
234  * devfs, since the resume code can only recognize the form /dev/hda4,
235  * but the suspend code would see the long name.)
236  */
237 static int is_resume_device(const struct swap_info_struct *swap_info)
238 {
239         struct file *file = swap_info->swap_file;
240         struct inode *inode = file->f_dentry->d_inode;
241
242         return S_ISBLK(inode->i_mode) &&
243                 resume_device == MKDEV(imajor(inode), iminor(inode));
244 }
245
246 static void read_swapfiles(void) /* This is called before saving image */
247 {
248         int i, len;
249         
250         len=strlen(resume_file);
251         root_swap = 0xFFFF;
252         
253         swap_list_lock();
254         for(i=0; i<MAX_SWAPFILES; i++) {
255                 if (swap_info[i].flags == 0) {
256                         swapfile_used[i]=SWAPFILE_UNUSED;
257                 } else {
258                         if(!len) {
259                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
260                                 if(root_swap == 0xFFFF) {
261                                         swapfile_used[i] = SWAPFILE_SUSPEND;
262                                         root_swap = i;
263                                 } else
264                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
265                         } else {
266                                 /* we ignore all swap devices that are not the resume_file */
267                                 if (is_resume_device(&swap_info[i])) {
268                                         swapfile_used[i] = SWAPFILE_SUSPEND;
269                                         root_swap = i;
270                                 } else {
271                                         swapfile_used[i] = SWAPFILE_IGNORED;
272                                 }
273                         }
274                 }
275         }
276         swap_list_unlock();
277 }
278
279 static void lock_swapdevices(void) /* This is called after saving image so modification
280                                       will be lost after resume... and that's what we want. */
281 {
282         int i;
283
284         swap_list_lock();
285         for(i = 0; i< MAX_SWAPFILES; i++)
286                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
287                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
288                                                        lock_swapdevices can unlock the devices. */
289                 }
290         swap_list_unlock();
291 }
292
293 /**
294  *    write_suspend_image - Write entire image to disk.
295  *
296  *    After writing suspend signature to the disk, suspend may no
297  *    longer fail: we have ready-to-run image in swap, and rollback
298  *    would happen on next reboot -- corrupting data.
299  *
300  *    Note: The buffer we allocate to use to write the suspend header is
301  *    not freed; its not needed since the system is going down anyway
302  *    (plus it causes an oops and I'm lazy^H^H^H^Htoo busy).
303  */
304 static int write_suspend_image(void)
305 {
306         int i;
307         swp_entry_t entry, prev = { 0 };
308         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
309         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
310         unsigned long address;
311         struct page *page;
312
313         if (!buffer)
314                 return -ENOMEM;
315
316         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
317         for (i=0; i<nr_copy_pages; i++) {
318                 if (!(i%100))
319                         printk( "." );
320                 entry = get_swap_page();
321                 if (!entry.val)
322                         panic("\nNot enough swapspace when writing data" );
323                 
324                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
325                         panic("\nPage %d: not enough swapspace on suspend device", i );
326             
327                 address = (pagedir_nosave+i)->address;
328                 page = virt_to_page(address);
329                 rw_swap_page_sync(WRITE, entry, page);
330                 (pagedir_nosave+i)->swap_address = entry;
331         }
332         printk( "|\n" );
333         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
334         for (i=0; i<nr_pgdir_pages; i++) {
335                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
336                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
337                 printk( "." );
338                 entry = get_swap_page();
339                 if (!entry.val) {
340                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
341                         panic("Don't know how to recover");
342                         free_page((unsigned long) buffer);
343                         return -ENOSPC;
344                 }
345
346                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
347                         panic("\nNot enough swapspace for pagedir on suspend device" );
348
349                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
350                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
351
352                 cur->link.next = prev;                          
353                 page = virt_to_page((unsigned long)cur);
354                 rw_swap_page_sync(WRITE, entry, page);
355                 prev = entry;
356         }
357         printk("H");
358         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
359         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
360         BUG_ON (sizeof(struct link) != PAGE_SIZE);
361         entry = get_swap_page();
362         if (!entry.val)
363                 panic( "\nNot enough swapspace when writing header" );
364         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
365                 panic("\nNot enough swapspace for header on suspend device" );
366
367         cur = (void *) buffer;
368         if (fill_suspend_header(&cur->sh))
369                 BUG();          /* Not a BUG_ON(): we want fill_suspend_header to be called, always */
370                 
371         cur->link.next = prev;
372
373         page = virt_to_page((unsigned long)cur);
374         rw_swap_page_sync(WRITE, entry, page);
375         prev = entry;
376
377         printk( "S" );
378         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
379         printk( "|\n" );
380
381         MDELAY(1000);
382         return 0;
383 }
384
385 #ifdef CONFIG_HIGHMEM
386 struct highmem_page {
387         char *data;
388         struct page *page;
389         struct highmem_page *next;
390 };
391
392 struct highmem_page *highmem_copy = NULL;
393
394 static int save_highmem_zone(struct zone *zone)
395 {
396         unsigned long zone_pfn;
397         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
398                 struct page *page;
399                 struct highmem_page *save;
400                 void *kaddr;
401                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
402                 int chunk_size;
403
404                 if (!(pfn%1000))
405                         printk(".");
406                 if (!pfn_valid(pfn))
407                         continue;
408                 page = pfn_to_page(pfn);
409                 /*
410                  * This condition results from rvmalloc() sans vmalloc_32()
411                  * and architectural memory reservations. This should be
412                  * corrected eventually when the cases giving rise to this
413                  * are better understood.
414                  */
415                 if (PageReserved(page)) {
416                         printk("highmem reserved page?!\n");
417                         continue;
418                 }
419                 if ((chunk_size = is_head_of_free_region(page))) {
420                         pfn += chunk_size - 1;
421                         zone_pfn += chunk_size - 1;
422                         continue;
423                 }
424                 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
425                 if (!save)
426                         return -ENOMEM;
427                 save->next = highmem_copy;
428                 save->page = page;
429                 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
430                 if (!save->data) {
431                         kfree(save);
432                         return -ENOMEM;
433                 }
434                 kaddr = kmap_atomic(page, KM_USER0);
435                 memcpy(save->data, kaddr, PAGE_SIZE);
436                 kunmap_atomic(kaddr, KM_USER0);
437                 highmem_copy = save;
438         }
439         return 0;
440 }
441
442 static int save_highmem(void)
443 {
444         struct zone *zone;
445         int res = 0;
446         for_each_zone(zone) {
447                 if (is_highmem(zone))
448                         res = save_highmem_zone(zone);
449                 if (res)
450                         return res;
451         }
452         return 0;
453 }
454
455 static int restore_highmem(void)
456 {
457         while (highmem_copy) {
458                 struct highmem_page *save = highmem_copy;
459                 void *kaddr;
460                 highmem_copy = save->next;
461
462                 kaddr = kmap_atomic(save->page, KM_USER0);
463                 memcpy(kaddr, save->data, PAGE_SIZE);
464                 kunmap_atomic(kaddr, KM_USER0);
465                 free_page((long) save->data);
466                 kfree(save);
467         }
468         return 0;
469 }
470 #endif
471
472 static int pfn_is_nosave(unsigned long pfn)
473 {
474         unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
475         unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
476         return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
477 }
478
479 /* if *pagedir_p != NULL it also copies the counted pages */
480 static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
481 {
482         unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
483         struct pbe *pbe = *pagedir_p;
484         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
485                 struct page *page;
486                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
487
488                 if (!(pfn%1000))
489                         printk(".");
490                 if (!pfn_valid(pfn))
491                         continue;
492                 page = pfn_to_page(pfn);
493                 BUG_ON(PageReserved(page) && PageNosave(page));
494                 if (PageNosave(page))
495                         continue;
496                 if (PageReserved(page) && pfn_is_nosave(pfn)) {
497                         PRINTK("[nosave pfn 0x%lx]", pfn);
498                         continue;
499                 }
500                 if ((chunk_size = is_head_of_free_region(page))) {
501                         pfn += chunk_size - 1;
502                         zone_pfn += chunk_size - 1;
503                         continue;
504                 }
505                 nr_copy_pages++;
506                 if (!pbe)
507                         continue;
508                 pbe->orig_address = (long) page_address(page);
509                 /* Copy page is dangerous: it likes to mess with
510                    preempt count on specific cpus. Wrong preempt count is then copied,
511                    oops. */
512                 copy_page((void *)pbe->address, (void *)pbe->orig_address);
513                 pbe++;
514         }
515         *pagedir_p = pbe;
516         return nr_copy_pages;
517 }
518
519 static int count_and_copy_data_pages(struct pbe *pagedir_p)
520 {
521         int nr_copy_pages = 0;
522         struct zone *zone;
523         for_each_zone(zone) {
524                 if (!is_highmem(zone))
525                         nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
526         }
527         return nr_copy_pages;
528 }
529
530 static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
531 {
532         unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
533         pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
534         pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
535         pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
536         for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
537                 struct page *page;
538                 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
539                 if (!pfn_valid(pfn))
540                         continue;
541                 page = pfn_to_page(pfn);
542                 if (!TestClearPageNosave(page))
543                         continue;
544                 else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
545                         continue;
546                 __free_page(page);
547         }
548 }
549
550 static void free_suspend_pagedir(unsigned long this_pagedir)
551 {
552         struct zone *zone;
553         for_each_zone(zone) {
554                 if (!is_highmem(zone))
555                         free_suspend_pagedir_zone(zone, this_pagedir);
556         }
557         free_pages(this_pagedir, pagedir_order);
558 }
559
560 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
561 {
562         int i;
563         suspend_pagedir_t *pagedir;
564         struct pbe *p;
565         struct page *page;
566
567         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
568
569         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
570         if (!pagedir)
571                 return NULL;
572
573         page = virt_to_page(pagedir);
574         for(i=0; i < 1<<pagedir_order; i++)
575                 SetPageNosave(page++);
576                 
577         while(nr_copy_pages--) {
578                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
579                 if (!p->address) {
580                         free_suspend_pagedir((unsigned long) pagedir);
581                         return NULL;
582                 }
583                 SetPageNosave(virt_to_page(p->address));
584                 p->orig_address = 0;
585                 p++;
586         }
587         return pagedir;
588 }
589
590 static int prepare_suspend_processes(void)
591 {
592         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
593         if (freeze_processes()) {
594                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
595                 thaw_processes();
596                 return 1;
597         }
598         return 0;
599 }
600
/*
 * free_some_memory - release as much memory as possible without
 * OOM-killing anyone.
 *
 * Calls shrink_all_memory(10000) repeatedly until it returns 0, printing
 * one dot per round.
 *
 * Notice: all userland should be stopped at this point, or livelock is possible.
 */
static void free_some_memory(void)
{
        printk("Freeing memory: ");
        for (;;) {
                if (!shrink_all_memory(10000))
                        break;
                printk(".");
        }
        printk("|\n");
}
613
614 static int suspend_prepare_image(void)
615 {
616         struct sysinfo i;
617         unsigned int nr_needed_pages = 0;
618
619         pagedir_nosave = NULL;
620         printk( "/critical section: ");
621 #ifdef CONFIG_HIGHMEM
622         printk( "handling highmem" );
623         if (save_highmem()) {
624                 printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
625                 return -ENOMEM;
626         }
627         printk(", ");
628 #endif
629
630         printk("counting pages to copy" );
631         drain_local_pages();
632         nr_copy_pages = count_and_copy_data_pages(NULL);
633         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
634         
635         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
636         if(nr_free_pages() < nr_needed_pages) {
637                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
638                        name_suspend, nr_needed_pages-nr_free_pages());
639                 root_swap = 0xFFFF;
640                 return -ENOMEM;
641         }
642         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
643                                    We should only consider resume_device. */
644         if (i.freeswap < nr_needed_pages)  {
645                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
646                        name_suspend, nr_needed_pages-i.freeswap);
647                 return -ENOSPC;
648         }
649
650         PRINTK( "Alloc pagedir\n" ); 
651         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
652         if (!pagedir_nosave) {
653                 /* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
654                 printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
655                 return -ENOMEM;
656         }
657         nr_copy_pages_check = nr_copy_pages;
658         pagedir_order_check = pagedir_order;
659
660         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
661         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
662                 BUG();
663
664         /*
665          * End of critical section. From now on, we can write to memory,
666          * but we should not touch disk. This specially means we must _not_
667          * touch swap space! Except we must write out our image of course.
668          */
669
670         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
671         return 0;
672 }
673
674 static void suspend_save_image(void)
675 {
676         device_resume();
677
678         lock_swapdevices();
679         write_suspend_image();
680         lock_swapdevices();     /* This will unlock ignored swap devices since writing is finished */
681
682         /* It is important _NOT_ to umount filesystems at this point. We want
683          * them synced (in case something goes wrong) but we DO not want to mark
684          * filesystem clean: it is not. (And it does not matter, if we resume
685          * correctly, we'll mark system clean, anyway.)
686          */
687 }
688
689 static void suspend_power_down(void)
690 {
691         extern int C_A_D;
692         C_A_D = 0;
693         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
694 #ifdef CONFIG_VT
695         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
696         mdelay(1000);
697         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
698                 machine_restart(NULL);
699         else
700 #endif
701         {
702                 device_suspend(3);
703                 device_shutdown();
704                 machine_power_off();
705         }
706
707         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
708         machine_halt();
709         while (1);
710         /* NOTREACHED */
711 }
712
713 /*
714  * Magic happens here
715  */
716
717 asmlinkage void do_magic_resume_1(void)
718 {
719         barrier();
720         mb();
721         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
722
723         device_power_down(3);
724         PRINTK( "Waiting for DMAs to settle down...\n");
725         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
726                            Do it with disabled interrupts for best effect. That way, if some
727                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
728 }
729
730 asmlinkage void do_magic_resume_2(void)
731 {
732         BUG_ON (nr_copy_pages_check != nr_copy_pages);
733         BUG_ON (pagedir_order_check != pagedir_order);
734
735         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
736
737         PRINTK( "Freeing prev allocated pagedir\n" );
738         free_suspend_pagedir((unsigned long) pagedir_save);
739
740 #ifdef CONFIG_HIGHMEM
741         printk( "Restoring highmem\n" );
742         restore_highmem();
743 #endif
744         printk("done, devices\n");
745
746         device_power_up();
747         spin_unlock_irq(&suspend_pagedir_lock);
748         device_resume();
749
750         /* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
751         PRINTK( "Fixing swap signatures... " );
752         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
753         PRINTK( "ok\n" );
754
755 #ifdef SUSPEND_CONSOLE
756         acquire_console_sem();
757         update_screen(fg_console);
758         release_console_sem();
759 #endif
760 }
761
762 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
763
764         if (!resume) {
765                 do_magic_suspend_1();
766                 save_processor_state();
767                 SAVE_REGISTERS
768                 do_magic_suspend_2();
769                 return;
770         }
771         GO_TO_SWAPPER_PAGE_TABLES
772         do_magic_resume_1();
773         COPY_PAGES_BACK
774         RESTORE_REGISTERS
775         restore_processor_state();
776         do_magic_resume_2();
777
778  */
779
780 asmlinkage void do_magic_suspend_1(void)
781 {
782         mb();
783         barrier();
784         BUG_ON(in_atomic());
785         spin_lock_irq(&suspend_pagedir_lock);
786 }
787
788 asmlinkage void do_magic_suspend_2(void)
789 {
790         int is_problem;
791         read_swapfiles();
792         device_power_down(3);
793         is_problem = suspend_prepare_image();
794         device_power_up();
795         spin_unlock_irq(&suspend_pagedir_lock);
796         if (!is_problem) {
797                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
798                 BUG_ON(in_atomic());
799                 suspend_save_image();
800                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
801         }
802
803         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
804         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
805
806         barrier();
807         mb();
808         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
809
810         free_pages((unsigned long) pagedir_nosave, pagedir_order);
811         spin_unlock_irq(&suspend_pagedir_lock);
812
813         device_resume();
814         PRINTK( "Fixing swap signatures... " );
815         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
816         PRINTK( "ok\n" );
817 }
818
819 /*
820  * This is main interface to the outside world. It needs to be
821  * called from process context.
822  */
823 int software_suspend(void)
824 {
825         int res;
826         if (!software_suspend_enabled)
827                 return -EAGAIN;
828
829         software_suspend_enabled = 0;
830         might_sleep();
831
832         if (arch_prepare_suspend()) {
833                 printk("%sArchitecture failed to prepare\n", name_suspend);
834                 return -EPERM;
835         }               
836         if (pm_prepare_console())
837                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
838         if (!prepare_suspend_processes()) {
839
840                 /* At this point, all user processes and "dangerous"
841                    kernel threads are stopped. Free some memory, as we
842                    need half of memory free. */
843
844                 free_some_memory();
845                 disable_nonboot_cpus();
846                 /* Save state of all device drivers, and stop them. */
847                 printk("Suspending devices... ");
848                 if ((res = device_suspend(3))==0) {
849                         /* If stopping device drivers worked, we proceed basically into
850                          * suspend_save_image.
851                          *
852                          * do_magic(0) returns after system is resumed.
853                          *
854                          * do_magic() copies all "used" memory to "free" memory, then
855                          * unsuspends all device drivers, and writes memory to disk
856                          * using normal kernel mechanism.
857                          */
858                         do_magic(0);
859                 }
860                 thaw_processes();
861                 enable_nonboot_cpus();
862         } else
863                 res = -EBUSY;
864         software_suspend_enabled = 1;
865         MDELAY(1000);
866         pm_restore_console();
867         return res;
868 }
869
870 /* More restore stuff */
871
872 #define does_collide(addr) does_collide_order(pagedir_nosave, addr, 0)
873
874 /*
875  * Returns true if given address/order collides with any orig_address 
876  */
877 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
878                 int order)
879 {
880         int i;
881         unsigned long addre = addr + (PAGE_SIZE<<order);
882         
883         for(i=0; i < nr_copy_pages; i++)
884                 if((pagedir+i)->orig_address >= addr &&
885                         (pagedir+i)->orig_address < addre)
886                         return 1;
887
888         return 0;
889 }
890
891 /*
892  * We check here that pagedir & pages it points to won't collide with pages
893  * where we're going to restore from the loaded pages later
894  */
895 static int check_pagedir(void)
896 {
897         int i;
898
899         for(i=0; i < nr_copy_pages; i++) {
900                 unsigned long addr;
901
902                 do {
903                         addr = get_zeroed_page(GFP_ATOMIC);
904                         if(!addr)
905                                 return -ENOMEM;
906                 } while (does_collide(addr));
907
908                 (pagedir_nosave+i)->address = addr;
909         }
910         return 0;
911 }
912
913 static int relocate_pagedir(void)
914 {
915         /*
916          * We have to avoid recursion (not to overflow kernel stack),
917          * and that's why code looks pretty cryptic 
918          */
919         suspend_pagedir_t *old_pagedir = pagedir_nosave;
920         void **eaten_memory = NULL;
921         void **c = eaten_memory, *m, *f;
922         int ret = 0;
923
924         printk("Relocating pagedir ");
925
926         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
927                 printk("not necessary\n");
928                 return 0;
929         }
930
931         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order)) != NULL) {
932                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
933                         break;
934                 eaten_memory = m;
935                 printk( "." ); 
936                 *eaten_memory = c;
937                 c = eaten_memory;
938         }
939
940         if (!m) {
941                 printk("out of memory\n");
942                 ret = -ENOMEM;
943         } else {
944                 pagedir_nosave =
945                         memcpy(m, old_pagedir, PAGE_SIZE << pagedir_order);
946         }
947
948         c = eaten_memory;
949         while (c) {
950                 printk(":");
951                 f = c;
952                 c = *c;
953                 free_pages((unsigned long)f, pagedir_order);
954         }
955         printk("|\n");
956         return ret;
957 }
958
959 /*
960  * Sanity check if this image makes sense with this kernel/swap context
961  * I really don't think that it's foolproof but more than nothing..
962  */
963
964 static int sanity_check_failed(char *reason)
965 {
966         printk(KERN_ERR "%s%s\n", name_resume, reason);
967         return -EPERM;
968 }
969
970 static int sanity_check(struct suspend_header *sh)
971 {
972         if (sh->version_code != LINUX_VERSION_CODE)
973                 return sanity_check_failed("Incorrect kernel version");
974         if (sh->num_physpages != num_physpages)
975                 return sanity_check_failed("Incorrect memory size");
976         if (strncmp(sh->machine, system_utsname.machine, 8))
977                 return sanity_check_failed("Incorrect machine type");
978         if (strncmp(sh->version, system_utsname.version, 20))
979                 return sanity_check_failed("Incorrect version");
980         if (sh->num_cpus != num_online_cpus())
981                 return sanity_check_failed("Incorrect number of cpus");
982         if (sh->page_size != PAGE_SIZE)
983                 return sanity_check_failed("Incorrect PAGE_SIZE");
984         return 0;
985 }
986
987 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
988 {
989         struct buffer_head *bh;
990         BUG_ON (pos%PAGE_SIZE);
991         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
992         if (!bh || (!bh->b_data)) {
993                 return -1;
994         }
995         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
996         BUG_ON(!buffer_uptodate(bh));
997         brelse(bh);
998         return 0;
999
1000
1001 static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
1002 {
1003 #if 0
1004         struct buffer_head *bh;
1005         BUG_ON (pos%PAGE_SIZE);
1006         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
1007         if (!bh || (!bh->b_data)) {
1008                 return -1;
1009         }
1010         memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
1011         BUG_ON(!buffer_uptodate(bh));
1012         generic_make_request(WRITE, bh);
1013         if (!buffer_uptodate(bh))
1014                 printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
1015         wait_on_buffer(bh);
1016         brelse(bh);
1017         return 0;
1018 #endif
1019         printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
1020         return 0;
1021 }
1022
1023 extern dev_t __init name_to_dev_t(const char *line);
1024
1025 static int __init __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
1026 {
1027         swp_entry_t next;
1028         int i, nr_pgdir_pages;
1029
1030 #define PREPARENEXT \
1031         {       next = cur->link.next; \
1032                 next.val = swp_offset(next) * PAGE_SIZE; \
1033         }
1034
1035         if (bdev_read_page(bdev, 0, cur)) return -EIO;
1036
1037         if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
1038             (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
1039                 printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
1040                 return -EINVAL;
1041         }
1042
1043         PREPARENEXT; /* We have to read next position before we overwrite it */
1044
1045         if (!memcmp("S1",cur->swh.magic.magic,2))
1046                 memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
1047         else if (!memcmp("S2",cur->swh.magic.magic,2))
1048                 memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
1049         else {
1050                 if (noresume)
1051                         return -EINVAL;
1052                 panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
1053                         name_resume, cur->swh.magic.magic);
1054         }
1055         if (noresume) {
1056                 /* We don't do a sanity check here: we want to restore the swap
1057                    whatever version of kernel made the suspend image;
1058                    We need to write swap, but swap is *not* enabled so
1059                    we must write the device directly */
1060                 printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
1061                 bdev_write_page(bdev, 0, cur);
1062         }
1063
1064         printk( "%sSignature found, resuming\n", name_resume );
1065         MDELAY(1000);
1066
1067         if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1068         if (sanity_check(&cur->sh))     /* Is this same machine? */     
1069                 return -EPERM;
1070         PREPARENEXT;
1071
1072         pagedir_save = cur->sh.suspend_pagedir;
1073         nr_copy_pages = cur->sh.num_pbes;
1074         nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
1075         pagedir_order = get_bitmask_order(nr_pgdir_pages);
1076
1077         pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
1078         if (!pagedir_nosave)
1079                 return -ENOMEM;
1080
1081         PRINTK( "%sReading pagedir, ", name_resume );
1082
1083         /* We get pages in reverse order of saving! */
1084         for (i=nr_pgdir_pages-1; i>=0; i--) {
1085                 BUG_ON (!next.val);
1086                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
1087                 if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1088                 PREPARENEXT;
1089         }
1090         BUG_ON (next.val);
1091
1092         if (relocate_pagedir())
1093                 return -ENOMEM;
1094         if (check_pagedir())
1095                 return -ENOMEM;
1096
1097         printk( "Reading image data (%d pages): ", nr_copy_pages );
1098         for(i=0; i < nr_copy_pages; i++) {
1099                 swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
1100                 if (!(i%100))
1101                         printk( "." );
1102                 /* You do not need to check for overlaps...
1103                    ... check_pagedir already did this work */
1104                 if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
1105                         return -EIO;
1106         }
1107         printk( "|\n" );
1108         return 0;
1109 }
1110
1111 static int __init read_suspend_image(const char * specialfile, int noresume)
1112 {
1113         union diskpage *cur;
1114         unsigned long scratch_page = 0;
1115         int error;
1116         char b[BDEVNAME_SIZE];
1117
1118         resume_device = name_to_dev_t(specialfile);
1119         scratch_page = get_zeroed_page(GFP_ATOMIC);
1120         cur = (void *) scratch_page;
1121         if (cur) {
1122                 struct block_device *bdev;
1123                 printk("Resuming from device %s\n",
1124                                 __bdevname(resume_device, b));
1125                 bdev = open_by_devnum(resume_device, FMODE_READ);
1126                 if (IS_ERR(bdev)) {
1127                         error = PTR_ERR(bdev);
1128                 } else {
1129                         set_blocksize(bdev, PAGE_SIZE);
1130                         error = __read_suspend_image(bdev, cur, noresume);
1131                         blkdev_put(bdev);
1132                 }
1133         } else error = -ENOMEM;
1134
1135         if (scratch_page)
1136                 free_page(scratch_page);
1137         switch (error) {
1138                 case 0:
1139                         PRINTK("Reading resume file was successful\n");
1140                         break;
1141                 case -EINVAL:
1142                         break;
1143                 case -EIO:
1144                         printk( "%sI/O error\n", name_resume);
1145                         break;
1146                 case -ENOENT:
1147                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1148                         break;
1149                 case -ENOMEM:
1150                         printk( "%sNot enough memory\n", name_resume);
1151                         break;
1152                 default:
1153                         printk( "%sError %d resuming\n", name_resume, error );
1154         }
1155         MDELAY(1000);
1156         return error;
1157 }
1158
1159 /**
1160  *      software_resume - Resume from a saved image.
1161  *
1162  *      Called as a late_initcall (so all devices are discovered and 
1163  *      initialized), we call swsusp to see if we have a saved image or not.
1164  *      If so, we quiesce devices, then restore the saved image. We will 
1165  *      return above (in pm_suspend_disk() ) if everything goes well. 
1166  *      Otherwise, we fail gracefully and return to the normally 
1167  *      scheduled program.
1168  *
1169  */
1170 static int __init software_resume(void)
1171 {
1172         if (num_online_cpus() > 1) {
1173                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1174                 return -EINVAL;
1175         }
1176         /* We enable the possibility of machine suspend */
1177         software_suspend_enabled = 1;
1178         if (!resume_status)
1179                 return 0;
1180
1181         printk( "%s", name_resume );
1182         if (resume_status == NORESUME) {
1183                 if(resume_file[0])
1184                         read_suspend_image(resume_file, 1);
1185                 printk( "disabled\n" );
1186                 return 0;
1187         }
1188         MDELAY(1000);
1189
1190         if (pm_prepare_console())
1191                 printk("swsusp: Can't allocate a console... proceeding\n");
1192
1193         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1194                 printk( "suspension device unspecified\n" );
1195                 return -EINVAL;
1196         }
1197
1198         printk( "resuming from %s\n", resume_file);
1199         if (read_suspend_image(resume_file, 0))
1200                 goto read_failure;
1201         /* FIXME: Should we stop processes here, just to be safer? */
1202         disable_nonboot_cpus();
1203         device_suspend(3);
1204         do_magic(1);
1205         panic("This never returns");
1206
1207 read_failure:
1208         pm_restore_console();
1209         return 0;
1210 }
1211
1212 late_initcall(software_resume);
1213
1214 static int __init resume_setup(char *str)
1215 {
1216         if (resume_status == NORESUME)
1217                 return 1;
1218
1219         strncpy( resume_file, str, 255 );
1220         resume_status = RESUME_SPECIFIED;
1221
1222         return 1;
1223 }
1224
1225 static int __init noresume_setup(char *str)
1226 {
1227         resume_status = NORESUME;
1228         return 1;
1229 }
1230
1231 __setup("noresume", noresume_setup);
1232 __setup("resume=", resume_setup);
1233
1234 EXPORT_SYMBOL(software_suspend);
1235 EXPORT_SYMBOL(software_suspend_enabled);